#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Created by: Niuyoucai
# 2022.04.09 Ver 1.0
# Scraper for https://rili.jin10.com (economic calendar)

import re
import sys
import smtplib
import email.message
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup


class Spider(object):
    """Scraper for the jin10.com economic-calendar pages.

    Fetches one day's page, extracts the important economic-data rows,
    important events and holidays, accumulates them as an HTML fragment
    in ``self.mailtext``, and can send the result by QQ SMTP mail.

    Created by: Niuyoucai  2022.04.09  Ver 1.0
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.141 Safari/537.36'
        }
        self.root = 'https://rili.jin10.com/day/'
        self.session = requests.session()
        self.html = ''       # raw HTML of the most recently fetched page
        self.mailtext = ''   # accumulated HTML mail body across fetches

    def getHtmlContent(self, dt):
        """Fetch the calendar page for date *dt* ('YYYY-MM-DD') and parse it.

        Appends the parsed sections to ``self.mailtext``.  On a network
        error the page is treated as empty and nothing is appended.
        """
        self.dt = dt
        url = self.root + dt
        try:
            response = self.session.get(url, headers=self.headers)
            response.encoding = 'utf-8'
            self.html = response.text
            self.getDatas()
            self.getEvents()
            self.getHolidays()
        except requests.exceptions.RequestException:
            self.html = ''

    def _renderRows(self, trs, important_only):
        """Render table rows as tag-stripped text, one '<br>'-terminated line per row.

        When *important_only* is true, rows whose markup does not contain
        'is-important' are skipped.
        """
        parts = []
        for tr in trs:
            if important_only and 'is-important' not in str(tr):
                continue
            for col in tr.find_all('td'):
                # Strip all HTML tags, keep the cell text only.
                parts.append(re.sub('<.*?>', '', str(col)).strip())
                parts.append(' ')
            parts.append('<br>')  # mail body is HTML, so use <br> as line break
        return ''.join(parts)

    def getDatas(self):
        """Extract the important economic-data section and append it to the mail body.

        The section is the HTML between the first occurrences of the
        markers '经济数据' and '事件' on the page.
        """
        stpos = self.html.find('经济数据')
        topos = self.html.find('事件')
        soup = BeautifulSoup(self.html[stpos:topos], 'html.parser')
        trs = soup.find_all('tr', attrs='jin-table-row')
        info = '[' + self.dt + ']' + '<br>' + '重要经济数据' + '<br>'
        info += self._renderRows(trs, important_only=True)
        self.mailtext += info

    def getEvents(self):
        """Extract the important-events section (between '事件' and '假期')
        and append it to the mail body."""
        stpos = self.html.find('事件')
        topos = self.html.find('假期')
        soup = BeautifulSoup(self.html[stpos:topos], 'html.parser')
        trs = soup.find_all('tr', attrs='jin-table-row')
        info = '重要事件' + '<br>'
        info += self._renderRows(trs, important_only=True)
        self.mailtext += info

    def getHolidays(self):
        """Extract the holiday section (from '假期' to end of page)
        and append it to the mail body."""
        stpos = self.html.find('假期')
        soup = BeautifulSoup(self.html[stpos:], 'html.parser')
        trs = soup.find_all('tr', attrs='jin-table-row current-countdown')
        info = '假期' + '<br>'
        info += self._renderRows(trs, important_only=False)
        self.mailtext += info

    def mailqq(self):
        """Send the accumulated mail body as an HTML email via QQ SMTP.

        Does nothing when nothing was scraped.  SMTP errors are printed,
        not raised; the connection is always closed.
        """
        # Skip sending when scraping produced no content.
        if self.mailtext == '':
            return
        smtpObj = smtplib.SMTP_SSL('smtp.qq.com', 465)
        msg = email.message.EmailMessage()
        msg.set_type('text/html')
        msg["From"] = "240884432@qq.com"
        msg["To"] = "240884432@qq.com"
        msg["Subject"] = "每日交易提醒"
        try:
            # NOTE(security): the QQ third-party authorization code is
            # hard-coded in source; it should be moved to an environment
            # variable or a config file kept out of version control.
            smtpObj.login('240884432@qq.com', 'ifjgwlnzdvrfbjgf')
            msg.add_alternative(self.mailtext, subtype='html')
            smtpObj.send_message(msg)
        except smtplib.SMTPException as e:
            print(repr(e))
        finally:
            smtpObj.quit()


def main():
    """Command-line entry point.

    No argument: scrape today and tomorrow.  One argument: scrape that
    date ('YYYY-MM-DD').  More arguments: print usage and exit.
    """
    if len(sys.argv) > 2:
        print('用法:', sys.argv[0], '[数据日期YYYY-MM-dd]')
        return
    myspider = Spider()
    if len(sys.argv) == 1:
        # Default: scrape today's and tomorrow's calendars.
        for offset in (0, 1):
            dt = datetime.strftime(datetime.now() + timedelta(days=offset), '%Y-%m-%d')
            myspider.getHtmlContent(dt)
    else:
        # A single explicit date was given on the command line.
        myspider.getHtmlContent(sys.argv[1])
    myspider.mailqq()


if __name__ == '__main__':
    main()