{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 发送GET请求\n", "response = requests.get(url, headers=headers)\n", "response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", "# 解析HTML内容\n", "soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", "# 使用CSS选择器查找特定元素\n", "selector = '#JinFlashList > div.flash-top > div.flash-top_right > div.flash-top_tool.tw-change-box > div.tool-setting.hide-dot > span:nth-child(2) > div.setting-popup > div:nth-child(3) > div > div.hot-filter_bot > div:nth-child(3)'\n", "element = soup.select_one(selector)\n", "\n", "# 检查元素是否存在并打印内容\n", "if element:\n", " print(element.prettify())\n", "else:\n", " print(\"未找到指定的元素\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 发送GET请求\n", "response = requests.get(url, headers=headers)\n", "response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", "# 解析HTML内容\n", "soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", "# 查找所有符合条件的元素\n", "items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", "# 提取满足条件的元素\n", "filtered_items = []\n", "for item in items:\n", " hot_labels = item.select('.hot-filter_item_label')\n", " for label in hot_labels:\n", " if label.get_text(strip=True) == '爆':\n", " filtered_items.append(item)\n", " break\n", "\n", "# 打印提取的元素内容\n", "for item in filtered_items:\n", " time = item.find('div', class_='item-time').get_text(strip=True)\n", " content = item.find('div', class_='flash-text').get_text(strip=True)\n", " print(f\"时间: {time}\")\n", " print(f\"内容: {content}\")\n", " print('-' * 40)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 发送GET请求\n", "response = requests.get(url, headers=headers)\n", "response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", "# 解析HTML内容\n", "soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", "# 查找所有符合条件的元素\n", "items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", "# 提取满足条件的元素\n", "filtered_items = []\n", "for item in items:\n", " hot_labels = item.select('.hot-filter_item_label')#hot-filter_item_label\n", " for label in hot_labels:\n", " if label.get_text(strip=True) == '火':\n", " filtered_items.append(item)\n", " if len(filtered_items) >= 20:\n", " break\n", " if len(filtered_items) >= 20:\n", " break\n", "\n", "# 打印提取的20条元素内容\n", "for i, item in enumerate(filtered_items, 1):\n", " time = item.find('div', class_='item-time').get_text(strip=True)\n", " content = item.find('div', class_='flash-text').get_text(strip=True)\n", " print(f\"消息 {i}:\")\n", " print(f\"时间: {time}\")\n", " print(f\"内容: {content}\")\n", " print('-' * 40)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 发送GET请求\n", "response = requests.get(url, headers=headers)\n", "response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", "# 解析HTML内容\n", "soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", "# 查找所有符合条件的元素\n", "items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", "# 提取满足条件的元素\n", "filtered_items = []\n", "for item in items:\n", " if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n", " filtered_items.append(item)\n", " if len(filtered_items) >= 20:\n", " break\n", "\n", "# 打印提取的20条元素内容\n", "for i, item in enumerate(filtered_items, 1):\n", " time = item.find('div', class_='item-time').get_text(strip=True)\n", " content = item.find('div', class_='flash-text').get_text(strip=True)\n", " print(f\"消息 {i}:\")\n", " print(f\"时间: {time}\")\n", " print(f\"内容: {content}\")\n", " print('-' * 40)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import smtplib\n", "from email.mime.multipart import MIMEMultipart\n", "from email.mime.text import MIMEText\n", "import schedule\n", "import time\n", "\n", "# 发件人邮箱和密码\n", "from_email = \"240884432@qq.com\"\n", "from_password = \"osjyjmbqrzxtbjbf\"\n", "to_email = \"240884432@qq.com\"\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 上一次获取的信息\n", "last_items = []\n", "\n", "# 发送邮件的函数\n", "def send_email(subject, content, to_email):\n", " msg = MIMEMultipart('alternative')\n", " msg['Subject'] = subject\n", " msg['From'] = from_email\n", " msg['To'] = to_email\n", " msg.attach(MIMEText(content, 'html'))\n", "\n", " try:\n", " server = smtplib.SMTP_SSL('smtp.qq.com', 465) # 这里使用SSL,端口通常是465\n", " server.login(from_email, from_password)\n", " server.sendmail(from_email, to_email, msg.as_string())\n", " server.quit()\n", " print(\"邮件发送成功\")\n", " except Exception as e:\n", " print(f\"邮件发送失败: {e}\")\n", "\n", "# 爬取并发送邮件的函数\n", "def fetch_and_notify():\n", " global last_items\n", " # 发送GET请求\n", " response = requests.get(url, headers=headers)\n", " response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", " # 解析HTML内容\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " # 查找所有符合条件的元素\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " filtered_items = []\n", " for item in items:\n", " if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n", " filtered_items.append(item)\n", " if len(filtered_items) >= 20:\n", " break\n", "\n", " if not filtered_items:\n", " return\n", "\n", " new_items = [item for item in filtered_items if item not in last_items]\n", " print(filtered_items)\n", " print(new_items)\n", "\n", " if new_items:\n", " email_content = \"\"\n", " for i, item in enumerate(new_items, 1):\n", " time = item.find('div', class_='item-time').get_text(strip=True)\n", " content = item.find('div', class_='flash-text').get_text(strip=True)\n", " email_content += f\"
消息 {i}:
时间: {time}
内容: {content}
没有新的市场快讯信息。
\", to_email)\n", "\n", "# 每5分钟运行一次\n", "schedule.every(1).minutes.do(fetch_and_notify)\n", "\n", "print(\"开始监控市场快讯信息...\")\n", "while True:\n", " schedule.run_pending()\n", " time.sleep(1)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import smtplib\n", "from email.mime.multipart import MIMEMultipart\n", "from email.mime.text import MIMEText\n", "import schedule\n", "import time\n", "\n", "# 邮箱配置\n", "from_email = \"240884432@qq.com\"\n", "from_password = \"osjyjmbqrzxtbjbf\"\n", "to_email = \"240884432@qq.com\"\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 上一次获取的信息\n", "last_items = []\n", "\n", "# 发送邮件的函数\n", "def send_email(subject, content, to_email):\n", " msg = MIMEMultipart('alternative')\n", " msg['Subject'] = subject\n", " msg['From'] = from_email\n", " msg['To'] = to_email\n", " msg.attach(MIMEText(content, 'html'))\n", "\n", " try:\n", " server = smtplib.SMTP_SSL('smtp.qq.com', 465) # 这里使用SSL,端口通常是465\n", " server.login(from_email, from_password)\n", " server.sendmail(from_email, to_email, msg.as_string())\n", " server.quit()\n", " print(\"邮件发送成功\")\n", " except Exception as e:\n", " print(f\"邮件发送失败: {e}\")\n", "\n", "# 爬取并发送邮件的函数\n", "def fetch_and_notify():\n", " global last_items\n", " response = requests.get(url, headers=headers)\n", " response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " filtered_items = []\n", " for item in items:\n", " if item.find('i', class_='jin-icon iconfont icon-huo is-huo'):\n", " filtered_items.append(item)\n", " if len(filtered_items) >= 10:\n", " break\n", "\n", " if not filtered_items:\n", " return\n", "\n", " new_items = [item for item in filtered_items if item not in last_items]\n", "\n", " if new_items:\n", " email_content = \"\"\n", " for i, item in enumerate(new_items, 1):\n", " time_text = item.find('div', class_='item-time').get_text(strip=True)\n", " content_text = item.find('div', class_='flash-text').get_text(strip=True)\n", " email_content += f\"消息 {i}:
时间: {time_text}
内容: {content_text}
没有新的市场快讯信息。
\", to_email)\n", "\n", "# 每5分钟运行一次\n", "schedule.every(1).minutes.do(fetch_and_notify)\n", "\n", "print(\"开始监控市场快讯信息...\")\n", "while True:\n", " schedule.run_pending()\n", " time.sleep(1)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "import smtplib\n", "from email.mime.multipart import MIMEMultipart\n", "from email.mime.text import MIMEText\n", "import schedule\n", "import time\n", "\n", "# 邮箱配置\n", "from_email = \"240884432@qq.com\"\n", "from_password = \"osjyjmbqrzxtbjbf\"\n", "to_email = \"240884432@qq.com\"\n", "\n", "# 目标URL\n", "url = 'https://www.jin10.com/'\n", "\n", "# 设置请求头,模拟浏览器访问\n", "headers = {\n", " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'\n", "}\n", "\n", "# 上一次获取的消息ID列表\n", "last_item_ids = []\n", "\n", "# 发送邮件的函数\n", "def send_email(subject, content, to_email):\n", " msg = MIMEMultipart('alternative')\n", " msg['Subject'] = subject\n", " msg['From'] = from_email\n", " msg['To'] = to_email\n", " msg.attach(MIMEText(content, 'html'))\n", "\n", " try:\n", " server = smtplib.SMTP_SSL('smtp.qq.com', 465) # 使用SSL,端口通常是465\n", " server.login(from_email, from_password)\n", " server.sendmail(from_email, to_email, msg.as_string())\n", " server.quit()\n", " print(\"邮件发送成功\")\n", " except Exception as e:\n", " print(f\"邮件发送失败: {e}\")\n", "\n", "# 爬取并发送邮件的函数\n", "def fetch_and_notify():\n", " global last_item_ids\n", " response = requests.get(url, headers=headers)\n", " response.raise_for_status() # 如果请求失败则抛出异常\n", "\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " filtered_items = []\n", " new_item_ids = []\n", " \n", " for item in items:\n", " item_id = item.get('id')\n", " if item.find('i', class_='flash-hot_text is-fei'):# jin-icon iconfont icon-huo is-huo\n", " filtered_items.append(item)\n", " new_item_ids.append(item_id)\n", " if len(filtered_items) >= 5:\n", " break\n", "\n", " if not filtered_items:\n", " return\n", "\n", " new_items = [item for item in filtered_items if item.get('id') not in last_item_ids]\n", "\n", " if new_items:\n", " email_content = \"\"\n", " for i, item in enumerate(new_items, 1):\n", " time_text = item.find('div', class_='item-time').get_text(strip=True)\n", " content_text = item.find('div', class_='flash-text').get_text(strip=True)\n", " email_content += f\"消息 {i}:
时间: {time_text}
内容: {content_text}
没有新的市场快讯信息。
\", to_email)\n", "\n", "# 每5分钟运行一次\n", "schedule.every(1).minutes.do(fetch_and_notify)\n", "\n", "print(\"开始监控市场快讯信息...\")\n", "while True:\n", " schedule.run_pending()\n", " time.sleep(1)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import requests\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = webdriver.ChromeOptions()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "driver = webdriver.Chrome(options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待网页加载并找到目标元素\n", " target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " target_element = WebDriverWait(driver, 10).until(\n", " EC.presence_of_element_located((By.XPATH, target_xpath))\n", " )\n", " # print(target_element)\n", "\n", " # 使用 JavaScript 修改元素的 class class=\"hot-filter_item is-active\"\n", " driver.execute_script('arguments[0].className = \"hot-filter_item is-active\";', target_element)\n", "\n", " # 等待内容更新(这里等待时间可以根据需要调整)\n", " time.sleep(5)\n", "\n", " # 获取页面内容\n", " page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析 HTML\n", " soup = BeautifulSoup(page_source, 'lxml')\n", "\n", " # 提取class为'jin-flash-item-container is-normal'的div标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:5], start=1):\n", " # print(f\"Information {index}:\")\n", " # print(item.get_text(strip=True)) # 打印提取的文本内容\n", " time = item.find('div', class_='item-time').get_text(strip=True)\n", " content = item.find('div', class_='flash-text').get_text(strip=True)\n", " print(f\"消息 {index}:\")\n", " print(f\"时间: {time}\")\n", " print(f\"内容: {content}\")\n", " print('-' * 40)\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = webdriver.ChromeOptions()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "driver = webdriver.Chrome(options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待网页加载并找到目标元素\n", " target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, target_xpath))\n", " )\n", "\n", " # 使用 JavaScript 修改元素的 class\n", " script = '''\n", " var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n", " element.className = \"hot-filter_item is-active\";\n", " '''\n", " driver.execute_script(script, target_xpath)\n", "\n", " # 等待内容更新\n", " time.sleep(5)\n", "\n", " # 获取页面内容\n", " page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析 HTML\n", " soup = BeautifulSoup(page_source, 'lxml')\n", "\n", " # 提取class为'jin-flash-item-container is-normal'的div标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:5], start=1):\n", " print(f\"Information {index}:\")\n", " print(item.get_text(strip=True)) # 打印提取的文本内容\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = webdriver.ChromeOptions()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "driver = webdriver.Chrome(options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待网页加载并找到目标元素\n", " target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " target_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, target_xpath))\n", " )\n", "\n", " # 使用 JavaScript 修改元素的 class\n", " script = '''\n", " var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n", " element.className = \"hot-filter_item is-active\";\n", " '''\n", " driver.execute_script(script, target_xpath)\n", "\n", " # 立即更新内容,假设更新内容的位置是已知的\n", " update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n", " update_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, update_xpath))\n", " )\n", " \n", " # 触发页面更新,这里假设更新内容是通过某种操作,比如点击按钮或其他方式\n", " # 请根据实际情况调整这里的操作\n", " driver.execute_script('arguments[0].scrollIntoView(true);', update_element)\n", " time.sleep(5) # 等待页面内容更新\n", "\n", " # 获取页面内容\n", " page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析 HTML\n", " soup = BeautifulSoup(page_source, 'lxml')\n", "\n", " # 提取 class 为 'jin-flash-item-container is-normal' 的 div 标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:40], start=1):\n", " print(f\"Information {index}:\")\n", " print(item.get_text(strip=True)) # 打印提取的文本内容\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = webdriver.ChromeOptions()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "driver = webdriver.Chrome(options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待网页加载并找到目标按钮\n", " button_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " button_element = WebDriverWait(driver, 20).until(\n", " EC.element_to_be_clickable((By.XPATH, button_xpath))\n", " )\n", "\n", " # 点击按钮\n", " button_element.click()\n", "\n", " # 等待页面内容更新\n", " time.sleep(30) # 根据需要调整等待时间\n", "\n", " # 获取页面内容\n", " page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析 HTML\n", " soup = BeautifulSoup(page_source, 'lxml')\n", "\n", " # 提取 class 为 'jin-flash-item-container is-normal' 的 div 标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:5], start=1):\n", " print(f\"Information {index}:\")\n", " print(item.get_text(strip=True)) # 打印提取的文本内容\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = webdriver.ChromeOptions()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "driver = webdriver.Chrome(options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待网页加载并找到目标元素\n", " target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " target_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, target_xpath))\n", " )\n", "\n", " # 使用 JavaScript 修改元素的 class\n", " script_modify_class = '''\n", " var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n", " element.className = \"hot-filter_item is-active\";\n", " '''\n", " driver.execute_script(script_modify_class, target_xpath)\n", "\n", " # 触发页面内容更新\n", " update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n", " update_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, update_xpath))\n", " )\n", "\n", " # 使用 JavaScript 来调用更新方法\n", " script_update_content = '''\n", " var updateElement = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n", " updateElement.classList.add(\"el-loading-parent--relative\");\n", " // 触发实际的内容更新逻辑,这可能需要根据实际页面的JavaScript实现来进行调整\n", " '''\n", " driver.execute_script(script_update_content, update_xpath)\n", "\n", " # 等待页面内容更新\n", " time.sleep(5) # 根据需要调整等待时间\n", "\n", " # 获取页面内容\n", " page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析 HTML\n", " soup = BeautifulSoup(page_source, 'lxml')\n", "\n", " # 提取 class 为 'jin-flash-item-container is-normal' 的 div 标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:5], start=1):\n", " print(f\"Information {index}:\")\n", " print(item.get_text(strip=True)) # 打印提取的文本内容\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.chrome.service import Service\n", "from selenium.webdriver.chrome.options import Options\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "\n", "# 初始化 WebDriver\n", "options = Options()\n", "options.add_argument('--headless') # 无头模式,不打开浏览器界面\n", "options.add_argument('--disable-gpu')\n", "# service = Service('path/to/chromedriver') # 替换为你的ChromeDriver路径\n", "driver = webdriver.Chrome(options=options)# (service=service, options=options)\n", "\n", "try:\n", " # 打开金十数据网站\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " # 等待页面加载\n", " time.sleep(5)\n", "\n", " # 模拟点击滑块\n", " switch_element_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]'\n", " switch_element = driver.find_element(By.XPATH, switch_element_xpath)\n", " switch_element.click()\n", "\n", " # 等待新内容加载\n", " time.sleep(5) # 可以调整时间,确保新内容加载完成\n", "\n", " # 获取新的页面内容\n", " new_page_source = driver.page_source\n", "\n", " # 使用 BeautifulSoup 解析新的 HTML\n", " soup = BeautifulSoup(new_page_source, 'lxml')\n", "\n", " # 提取 class 为 'jin-flash-item-container is-important' 的 div 标签\n", " items = soup.find_all('div', class_='jin-flash-item-container is-important')\n", "\n", " # 只爬取前5条信息\n", " for index, item in enumerate(items[:5], start=1):\n", " print(f\"Information {index}:\")\n", " print(item.get_text(strip=True)) # 打印提取的文本内容\n", " print('-' * 50)\n", "\n", "finally:\n", " # 关闭 WebDriver\n", " driver.quit()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "开始定时监控...\n", "火:现货黄金日内走低1.00%,现报3102.30美元/盎司。\n", "last_matched_hash 968c113779225800e7593a2e594ad0e0\n", "current_hash 968c113779225800e7593a2e594ad0e0\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "火:现货黄金日内走低1.00%,现报3102.30美元/盎司。\n", "last_matched_hash 968c113779225800e7593a2e594ad0e0\n", "current_hash 968c113779225800e7593a2e594ad0e0\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "火:现货黄金日内走低1.00%,现报3102.30美元/盎司。\n", "last_matched_hash 968c113779225800e7593a2e594ad0e0\n", "current_hash 968c113779225800e7593a2e594ad0e0\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "火:现货黄金日内走低1.00%,现报3102.30美元/盎司。\n", "last_matched_hash 968c113779225800e7593a2e594ad0e0\n", "current_hash 968c113779225800e7593a2e594ad0e0\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "火:现货黄金日内走低1.00%,现报3102.30美元/盎司。\n", "last_matched_hash 968c113779225800e7593a2e594ad0e0\n", "current_hash 968c113779225800e7593a2e594ad0e0\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "执行出错: Message: \n", "Stacktrace:\n", "\tGetHandleVerifier [0x00007FF6B5614C25+3179557]\n", "\t(No symbol) [0x00007FF6B52788A0]\n", "\t(No symbol) [0x00007FF6B51091CA]\n", "\t(No symbol) [0x00007FF6B515FA67]\n", "\t(No symbol) [0x00007FF6B515FC9C]\n", "\t(No symbol) [0x00007FF6B51B3627]\n", "\t(No symbol) [0x00007FF6B5187C6F]\n", "\t(No symbol) [0x00007FF6B51B02F3]\n", "\t(No symbol) [0x00007FF6B5187A03]\n", "\t(No symbol) [0x00007FF6B51506D0]\n", "\t(No symbol) [0x00007FF6B5151983]\n", "\tGetHandleVerifier [0x00007FF6B56767CD+3579853]\n", "\tGetHandleVerifier [0x00007FF6B568D1D2+3672530]\n", "\tGetHandleVerifier [0x00007FF6B5682153+3627347]\n", "\tGetHandleVerifier [0x00007FF6B53E092A+868650]\n", "\t(No symbol) [0x00007FF6B5282FFF]\n", "\t(No symbol) [0x00007FF6B527F4A4]\n", "\t(No symbol) [0x00007FF6B527F646]\n", "\t(No symbol) [0x00007FF6B526EAA9]\n", "\tBaseThreadInitThunk [0x00007FFF903C259D+29]\n", "\tRtlUserThreadStart [0x00007FFF90F4AF38+40]\n", "\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "邮件发送成功\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "last_matched_hash c2e035a6e60f47a267833722604e1a24\n", "current_hash c2e035a6e60f47a267833722604e1a24\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "last_matched_hash c2e035a6e60f47a267833722604e1a24\n", "current_hash c2e035a6e60f47a267833722604e1a24\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "last_matched_hash c2e035a6e60f47a267833722604e1a24\n", "current_hash c2e035a6e60f47a267833722604e1a24\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "last_matched_hash c2e035a6e60f47a267833722604e1a24\n", "current_hash c2e035a6e60f47a267833722604e1a24\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "热:纽约期银暴跌6%金十数据4月3日讯,纽约期银日内暴跌6.00%,现报32.57美元/盎司。现货白银跌4.43%,报32.40美元/盎司。\n", "last_matched_hash c2e035a6e60f47a267833722604e1a24\n", "current_hash c2e035a6e60f47a267833722604e1a24\n", "\n", "共找到 2 条匹配'默认火热沸爆'的信息\n", "火:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "邮件发送成功\n", "火:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash b8daf18e3735013ff4c9c47aeca33fa5\n", "current_hash b8daf18e3735013ff4c9c47aeca33fa5\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "邮件发送成功\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n", "热:黄金快速下破3090美元金十数据4月3日讯,现货黄金15分钟内快速下破两道关口,最低至3089.57美元/盎司,日内跌1.41%。\n", "last_matched_hash e00714f0ae93195b10a76db627c95bbe\n", "current_hash e00714f0ae93195b10a76db627c95bbe\n", "\n", "共找到 1 条匹配'默认火热沸爆'的信息\n" ] } ], "source": [ "from selenium import webdriver\n", "from selenium.webdriver.common.by import By\n", "from selenium.webdriver.support.ui import WebDriverWait\n", "from selenium.webdriver.support import expected_conditions as EC\n", "from bs4 import BeautifulSoup\n", "import time\n", "import smtplib\n", "from email.mime.text import MIMEText\n", "from email.mime.multipart import MIMEMultipart\n", "import hashlib\n", "import schedule\n", "\n", "from_email = \"240884432@qq.com\"\n", "from_password = \"osjyjmbqrzxtbjbf\"\n", "to_email = \"240884432@qq.com\"\n", "\n", "# 邮件发送函数\n", "def send_email(content):\n", " msg = MIMEMultipart('alternative')\n", " msg['Subject'] = '金十数据更新通知'\n", " msg['From'] = from_email\n", " msg['To'] = to_email\n", " msg.attach(MIMEText(content, 'html'))\n", "\n", " try:\n", " server = smtplib.SMTP_SSL('smtp.qq.com', 465)\n", " server.login(from_email, from_password)\n", " server.sendmail(from_email, to_email, msg.as_string())\n", " server.quit()\n", " print(\"邮件发送成功\")\n", " except Exception as e:\n", " print(f\"邮件发送失败: {e}\")\n", "\n", "# 全局存储上次匹配内容\n", "last_matched_hash = None\n", "\n", "# 主抓取函数\n", "def fetch_news():\n", " global last_matched_hash\n", " \n", " options = webdriver.ChromeOptions()\n", " options.add_argument('--headless')\n", " driver = webdriver.Chrome(options=options)\n", "\n", " try:\n", " driver.get(\"https://www.jin10.com/\")\n", "\n", " target_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[2]/div[2]/div[2]/div[4]/span[2]/div[1]/div[3]/div/div[2]/div[3]'\n", " target_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, target_xpath))\n", " )\n", "\n", " script = '''\n", " var element = document.evaluate(arguments[0], document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue;\n", " element.className = \"hot-filter_item is-active\";\n", " '''\n", " driver.execute_script(script, target_xpath)\n", "\n", " update_xpath = '/html/body/div[1]/div[2]/div[2]/div/main/div[2]/div[2]/div[3]'\n", " update_element = WebDriverWait(driver, 20).until(\n", " EC.presence_of_element_located((By.XPATH, update_xpath))\n", " )\n", " driver.execute_script('arguments[0].scrollIntoView(true);', update_element)\n", " time.sleep(5)\n", "\n", " page_source = driver.page_source\n", " soup = BeautifulSoup(page_source, 'lxml')\n", " items = soup.find_all('div', class_='jin-flash-item-container is-normal')\n", "\n", " matched_count = 0\n", " modified_text = ''\n", " \n", " for item in items[:40]:\n", " text_content = item.get_text(strip=True)\n", " # print(text_content)\n", " if '默认火热沸爆' not in text_content:\n", " continue\n", " \n", " matched_count += 1\n", " parts = text_content.split('默认火热沸爆', 1)\n", " if len(parts) > 1 and parts[1].strip():\n", " modified_text = parts[1].strip()[0] + \":\" + parts[1].strip()[1:]\n", "\n", " print(modified_text) \n", " if modified_text:\n", " current_hash = hashlib.md5(modified_text.encode()).hexdigest()\n", " if last_matched_hash and current_hash != last_matched_hash:\n", " send_email(modified_text)\n", " print(modified_text)\n", " last_matched_hash = current_hash\n", " print(\"last_matched_hash\", last_matched_hash)\n", " print(\"current_hash\", current_hash)\n", "\n", " print(f\"\\n共找到 {matched_count} 条匹配'默认火热沸爆'的信息\")\n", "\n", " except Exception as e:\n", " print(f\"执行出错: {e}\")\n", " finally:\n", " driver.quit()\n", "\n", "# 定时任务配置\n", "schedule.every(1).minutes.do(fetch_news)\n", "\n", "print(\"开始定时监控...\")\n", "while True:\n", " schedule.run_pending()\n", " time.sleep(1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 2 }