最近做了测试抓取XX时报数据的工作。由于需要事先登录,并且登录时有验证码,验证码的处理有两个途径:一是利用打码平台,其原理是把验证码图片上传给打码平台,

  由平台识别后返回验证码文本;二是自己研究验证码识别技术,这个有时间再研究。

    目前主要是测试从XX时报抓取数据,暂时采用了笨方法:人工介入,手动输入验证码。

    登录界面:

    python3 IEDriver抓取时报数据-LMLPHP

    具体代码如下:

  

#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import collections
import mongoDbBase
import numpy
import imagehash
from PIL import Image,ImageFile
import datetime
class finalNews_IE:
    """Scraper for the Financial News (金融时报) e-paper.

    Flow: log on through IE/Selenium (solving the 4-digit captcha by
    comparing perceptual hashes of the split digits against a local
    library of labelled digit images), collect layout/article URLs into
    MongoDB, fetch article bodies, then export everything to CSV.

    NOTE(review): reconstructed from an indentation-mangled paste; the
    nesting of a few statements is inferred — confirm against the
    original source if available.
    """

    def __init__(self, strdate, logonUrl, firstUrl, keyword_list, exportPath, codedir):
        self.iniDriver()
        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.logonUrl = logonUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.codedir = codedir
        # Maps imagehash of a digit image -> digit string ("0".."9"),
        # filled by longon() from the codeLibrary folder.
        self.hash_code_dict = {}

    def iniDriver(self):
        """Create the IE webdriver from a hard-coded IEDriverServer path."""
        IEDriverServer = "C:\Program Files\Internet Explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        """Append *message* to exportPath/fileName (no newline added here)."""
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a') as f:
            f.write(message)

    def get_ImageHash(self, imagefile):
        """Return the average perceptual hash of an image file, or None if missing."""
        hash = None
        if os.path.exists(imagefile):
            with open(imagefile, 'rb') as fp:
                hash = imagehash.average_hash(Image.open(fp))
        return hash

    def clearNoise(self, imageFile, x=0, y=0):
        """De-noise a captcha image in place: grayscale then hard threshold at 135.

        Returns the cleaned PIL image, or None when the file does not exist.
        """
        if os.path.exists(imageFile):
            image = Image.open(imageFile)
            image = image.convert('L')
            image = numpy.asarray(image)
            image = (image > 135) * 255
            image = Image.fromarray(image).convert('RGB')
            image.save(imageFile)
            return image

    def splitimage(self, imagePath, imageFile, rownum=1, colnum=4):
        """Split the captcha into rownum x colnum digit tiles.

        imagePath: output directory; imageFile: source image.
        Returns the list of tile file paths (None when the image is
        smaller than the requested grid).
        """
        img = Image.open(imageFile)
        w, h = img.size
        if rownum <= h and colnum <= w:
            print('Original image info: %sx%s, %s, %s' % (w, h, img.format, img.mode))
            print('开始处理图片切割, 请稍候...')
            s = os.path.split(imageFile)
            if imagePath == '':
                dstpath = s[0]  # NOTE(review): dstpath is never used afterwards
            fn = s[1].split('.')
            basename = fn[0]
            ext = fn[-1]
            num = 1
            rowheight = h // rownum
            colwidth = w // colnum
            file_list = []
            for r in range(rownum):
                index = 0
                for c in range(colnum):
                    # Column widths are tweaked per digit position; for
                    # index >= 3 the previous colwid value is reused.
                    if index < 1:
                        colwid = colwidth + 6
                    elif index < 2:
                        colwid = colwidth + 1
                    elif index < 3:
                        colwid = colwidth
                    box = (c * colwid, r * rowheight, (c + 1) * colwid, (r + 1) * rowheight)
                    newfile = os.path.join(imagePath, basename + '_' + str(num) + '.' + ext)
                    file_list.append(newfile)
                    img.crop(box).save(newfile, ext)
                    num = num + 1
                    index += 1
            return file_list

    def compare_image_with_hash(self, image_hash1, image_hash2, max_dif=5):
        """
        max_dif: 允许最大hash差值, 越小越精确,最小为0
        推荐使用
        """
        dif = image_hash1 - image_hash2
        if dif < 0:
            dif = -dif
        if dif <= max_dif:
            return True
        else:
            return False

    def savePicture(self):
        """Screenshot the page, crop out the captcha element, de-noise it,
        split it into digits and decode it against hash_code_dict.

        Returns the decoded captcha string (possibly shorter than 4 chars
        when some digit could not be matched).
        """
        self.driver.save_screenshot(self.codedir + "\Temp.png")
        checkcode = self.driver.find_element_by_id("checkcode")
        location = checkcode.location  # captcha x/y position
        size = checkcode.size          # captcha width/height
        rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # crop box for the captcha
        i = Image.open(self.codedir + "\Temp.png")
        result = i.crop(rangle)
        # (dead timestamp-based filename removed: it was immediately overwritten)
        filename = self.codedir + "\Temp_code.png"
        result.save(filename)
        self.clearNoise(filename)
        file_list = self.splitimage(self.codedir, filename)
        time.sleep(3)
        verycode = ''
        for f in file_list:
            imageHash = self.get_ImageHash(f)
            if imageHash:
                # Exact hash match (max_dif=0) against the digit library.
                for h, code in self.hash_code_dict.items():
                    flag = self.compare_image_with_hash(imageHash, h, 0)
                    if flag:
                        verycode += code
                        break
        print(verycode)
        return verycode

    def getVerycode(self, txtFile="verycode.txt"):
        """Read a manually supplied captcha from a text file."""
        with open(txtFile, 'r') as f:  # with-block fixes the original handle leak
            result = f.read()
        return result

    def longon(self):
        """Load the digit-hash library, then attempt one login.

        Returns True when the form was submitted with a 4-char decoded
        captcha, False otherwise (including on any exception).
        """
        # Library layout: codeLibrary/code<digit>_<variant>.png, digits 0-9,
        # variants 1-4.
        for f in range(0, 10):
            for l in range(1, 5):
                file = os.path.join(self.codedir, "codeLibrary\code" + str(f) + '_' + str(l) + ".png")
                hash = self.get_ImageHash(file)
                self.hash_code_dict[hash] = str(f)
        flag = True
        try:
            self.driver.get(self.logonUrl)
            self.driver.maximize_window()
            time.sleep(2)
            verycode = self.savePicture()
            if len(verycode) == 4:
                accname = self.driver.find_element_by_id("username")
                accname.send_keys('ctrchina')
                accpwd = self.driver.find_element_by_id("password")
                checkcode = self.driver.find_element_by_name("checkcode")
                checkcode.send_keys(verycode)
                submit = self.driver.find_element_by_name("button")
                submit.click()
            else:
                flag = False
        except Exception as e1:
            message = str(e1.args)
            flag = False
        return flag

    def saveUrls(self):
        """Log in (retrying while the page shows a captcha error), then walk
        the layout links on firstUrl and store article links per layout."""
        error = ''
        while True:
            flag = self.longon()
            time.sleep(2)
            if flag:
                try:
                    # This element only exists when login failed (bad captcha);
                    # absence raises and we fall through to break.
                    codefault = self.driver.find_element_by_xpath("//table[@class='table_login']/tbody/tr/td/font")
                    if codefault:
                        continue
                except Exception as e1:
                    pass
            break
        try:
            time.sleep(2)
            self.driver.get(self.firstUrl)
            self.driver.maximize_window()
            urllb = "//a[@id='pageLink']"
            time.sleep(2)
            elements = self.driver.find_elements_by_xpath(urllb)
            url_layout_dict = collections.OrderedDict()
            for element in elements:
                layout = element.text
                if len(layout) == 0:
                    continue
                link = element.get_attribute("href")
                print(link)
                if link not in url_layout_dict:
                    url_layout_dict[link] = layout
            index = 0
            for sub_url, layout in url_layout_dict.items():
                # First layout is the page we are already on, so pass "".
                if index == 0:
                    sub_url = ""
                print(index)
                self.getArticleLink(sub_url, layout)
                index += 1
        except Exception as e1:
            print("saveUrlsException")
            print("saveUrlsException:Exception" + str(e1.args))

    def getArticleLink(self, url, layout):
        """Collect article links from one layout page and save them to MongoDB.

        url == "" / falsy means: scrape the page the driver is already on.
        """
        error = ''
        try:
            if url:
                self.driver.get(url)
                self.driver.maximize_window()
                time.sleep(2)
            dt = datetime.datetime.now().strftime("%Y.%m.%d")
            urllb = "//div[@id='titleList']/ul/li/a"
            elements = self.driver.find_elements_by_xpath(urllb)
            url_layout_dict = {}
            for element in elements:
                txt = element.text
                txt = txt[txt.rfind(")") + 1:len(txt)]
                # Skip untitled entries, announcements, mastheads and date rows.
                if txt.find("无标题") > -1 or txt.find("公 告") > -1 or txt.find("FINANCIAL NEWS") > -1 or txt.find(dt) > -1:
                    continue
                link = element.get_attribute("href")
                print(link)
                url_layout_dict[link] = layout
            self.db.SavefinalUrl(url_layout_dict, self.date)
        except Exception as e1:
            print("getArticleLink:Exception")
            print("getArticleLink:Exception" + str(e1.args))
            error = e1.args

    def catchdata(self):
        """Fetch each saved article URL, extract title/author/body/keywords
        and persist the result; closes the driver when done."""
        rows = self.db.GetfinalUrl(self.date)
        lst = []
        for row in rows:
            lst.append(row)
        print("rowcount:" + str(len(lst)))
        count = 1
        for row in lst:
            url = row['url']
            layout = row['layout']
            try:
                self.driver.get(url)
                self.driver.maximize_window()
                time.sleep(1)
                title = ""
                element = self.driver.find_element_by_class_name("text_c")
                # Title is assembled from h3 + h1 + h2 (+ h4 unless it is a byline).
                title = element.find_element_by_css_selector("h3").text
                st = element.find_element_by_css_selector("h1").text
                if st:
                    title += "\n" + st
                st = element.find_element_by_css_selector("h2").text
                if st:
                    title += "\n" + st
                st = element.find_element_by_css_selector("h4").text
                if st:
                    if st.find("记者") == -1:
                        title += "\n" + st
                elements = self.driver.find_elements_by_xpath("//div[@id='ozoom']/p")
                content = ""
                key = ""
                index = 0
                author = ''
                for element in elements:
                    txt = element.text.strip().replace("\n", "")
                    content += txt
                    if index == 0:
                        # Heuristic author extraction from the first paragraph.
                        if txt.find("记者") > 0 and txt.find("报道") > 0:
                            author = txt[txt.find("记者") + 2:txt.find("报道")]
                        elif txt.find("记者") > 0 and txt.find("报道") == -1:
                            author = txt[txt.find("记者") + 2:len(txt)]
                        elif txt.find("记者") == -1 and txt.find("报道") == -1:
                            author = txt.strip()
                    index += 1
                for k in self.keyword_list:
                    if content.find(k) > -1 or title.find(k) > -1:
                        key += k + ","
                if key:
                    key = key[0:len(key) - 1]  # drop trailing comma
                author = author.replace("记者", "").strip()
                if len(author) > 6:
                    # Longer than a plausible name -> extraction failed.
                    author = ""
                print(count)
                print(layout)
                print(url)
                print(title)
                print(author)
                count += 1
                self.db.updatefinalUrl(url)
                self.db.SavefinalData(self.date, layout, url, title, author, key, content)
            except Exception as e1:
                error = e1.args
        self.driver.close()

    def export(self):
        """Export the day's stored articles to 金融时报_<date>.csv.

        Commas/newlines inside fields are replaced so the CSV stays one
        row per article.
        """
        rows = self.db.GetfinalData(self.date)
        lst = []
        for dataRow1 in rows:
            lst.append(dataRow1)
        count = 1
        fileName = '金融时报_' + self.date + '.csv'
        header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
        if len(lst) > 0:
            self.WriteData(header, fileName)
        for dataRow in lst:
            date = str(dataRow['date'])
            layout = str(dataRow['layout'])
            url = str(dataRow['url'])
            title = str(dataRow['title']).replace(",", ",").replace("\n", " ")
            author = str(dataRow['author']).replace(",", ",")
            key = str(dataRow['key']).replace(",", ",")
            wordcount = str(dataRow['wordcount'])
            content = str(dataRow['content']).replace(",", ",").replace("\n", " ")
            txt = "\n%s,%s,%s,%s,%s,%s,%s,%s" % (
                date, key, author, wordcount, title, layout, url, content)
            try:
                self.WriteData(txt, fileName)
            except Exception as e1:
                print(str(e1))
            print(count)
            count += 1
# dt = datetime.datetime.now().strftime("%Y-%m-%d")
# ym = datetime.datetime.now().strftime("%Y-%m")
# day = datetime.datetime.now().strftime("%d")
#
# codepath='E:/python36_crawl/mediaInfo/verycode.txt'
#
# logonUrl="http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
# # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
# firsturl="http://epaper.financialnews.com.cn/jrsb/html/"+ym+"/"+day+"/node_2.htm"
# # print(firsturl)
# keyword_list ="银保监会,央行,中国银行,中行,中银".split(",")
# exportPath="E:/News"
# codedir='E:\python36_crawl\Veriycode'
# obj = finalNews_IE(dt,logonUrl,firsturl,keyword_list,exportPath,codedir)
# # obj.saveUrls()
# obj.catchdata()
# obj.export()
# # obj.savePicture()

 采集时报2

python3 IEDriver抓取时报数据-LMLPHP

python3 IEDriver抓取时报数据-LMLPHP

python3 IEDriver抓取时报数据-LMLPHP

  layoutElement.get_attribute("onclick")

python3 IEDriver抓取时报数据-LMLPHP

  

  layoutLink = layoutElement.get_attribute("onclick")
#coding=utf-8
import os
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.common.action_chains import ActionChains
import collections
import mongoDbBase
import datetime
import numpy
from PIL import Image
import RClient
class firstfinal:
    """Scraper for the China Business News (第一财经日报) e-paper.

    Logs in via IE/Selenium with the captcha solved by an external
    RClient coding service, iterates layouts/articles by executing their
    onclick handlers, stores results in MongoDB and exports a CSV.

    NOTE(review): reconstructed from an indentation-mangled paste; the
    nesting of a few statements is inferred — confirm against the
    original source if available.
    """

    def __init__(self, strdate, firstUrl, keyword_list, exportPath, dirpath):
        self.db = mongoDbBase.mongoDbBase()
        self.date = strdate
        self.firstUrl = firstUrl
        self.keyword_list = keyword_list
        self.exportPath = exportPath
        self.dirpath = dirpath
        self.rclient = RClient.RClient()  # captcha-recognition client

    def iniDriver(self):
        """Create the IE webdriver from a hard-coded IEDriverServer path."""
        IEDriverServer = "C:\Program Files\internet explorer\IEDriverServer.exe"
        os.environ["webdriver.ie.driver"] = IEDriverServer
        self.driver = webdriver.Ie(IEDriverServer)

    def WriteData(self, message, fileName):
        """Append *message* to exportPath/fileName (no newline added here)."""
        fileName = os.path.join(os.getcwd(), self.exportPath + '/' + fileName)
        with open(fileName, 'a') as f:
            f.write(message)

    def getVerycode(self, txtFile="verycode.txt"):
        """Read a manually supplied captcha from a text file."""
        with open(txtFile, 'r') as f:  # with-block fixes the original handle leak
            result = f.read()
        return result

    def clearNoise(self, imageFile, x=0, y=0):
        """De-noise a captcha image in place: grayscale then hard threshold at 135.

        Returns the cleaned PIL image, or None when the file does not exist.
        """
        if os.path.exists(imageFile):
            image = Image.open(imageFile)
            image = image.convert('L')
            image = numpy.asarray(image)
            image = (image > 135) * 255
            image = Image.fromarray(image).convert('RGB')
            image.save(imageFile)
            return image

    def savePicture(self):
        """Open the login dialog, screenshot the page and crop the captcha.

        Returns the path of the saved captcha image.
        """
        # XPath indices start at 1: second menu item is the login link.
        logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")
        logon.click()
        time.sleep(2)
        checkcode = self.driver.find_element_by_id("Verify")
        temppng = "E:\python36_crawl\Veriycode\Temp.png"
        self.driver.save_screenshot("E:\python36_crawl\Veriycode\Temp.png")
        location = checkcode.location  # captcha x/y position
        size = checkcode.size          # captcha width/height
        rangle = (int(location['x']), int(location['y']), int(location['x'] + size['width']),
                  int(location['y'] + size['height']))  # crop box for the captcha
        i = Image.open(temppng)
        result = i.crop(rangle)
        result.save(temppng)
        return temppng

    def longon(self):
        """Start the driver, open the site and submit the login form with a
        captcha decoded by the RClient service."""
        self.iniDriver()
        self.driver.get(self.firstUrl)
        self.driver.maximize_window()
        logon = self.driver.find_element_by_xpath("//div[@class='topMenu']/div[2]/a")  # index starts at 1
        logon.click()
        self.driver.maximize_window()
        time.sleep(2)
        accname = self.driver.find_element_by_name("username")
        accname.send_keys('ctrchina')
        accpwd = self.driver.find_element_by_name("password")
        # The browser on the server remembers the password, so none is sent.
        accpwd.send_keys('')
        checkcode = self.driver.find_element_by_name("code")
        temppng = self.savePicture()
        code = self.rclient.test(temppng)
        checkcode.send_keys(code)
        submit = self.driver.find_element_by_xpath("//div[@class='UserFrom']/div[8]/button")
        submit.click()
        time.sleep(4)

    def catchData(self):
        """Walk every layout and every article (re-navigating through the
        onclick handlers each time, since the pages are script-driven),
        extract title/author/body/keywords and persist each article."""
        flag = True
        try:
            layoutlb = "//ul[@class='BNameList']/li/a"
            artclelb = "//div[@id='SetContent']/ul/li/a"
            contentlb = "//div[@id='SetContent']/ul/li/a"
            layoutElements = self.driver.find_elements_by_xpath(layoutlb)
            layoutCount = len(layoutElements)
            layoutIndex = 0
            layout = ''
            # Layout loop
            print("layoutCount=" + str(layoutCount))
            while layoutIndex < layoutCount:
                if layoutIndex > 0:
                    # Re-load the front page and re-find the layout link,
                    # because the previous navigation invalidated the elements.
                    self.driver.get(self.firstUrl)
                    self.driver.maximize_window()
                    layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                    layoutElement = layoutElements[layoutIndex]
                    layoutLink = layoutElement.get_attribute("onclick")
                    self.driver.execute_script(layoutLink)
                else:
                    layoutElement = layoutElements[layoutIndex]
                layout = layoutElement.text
                print(layout)
                articleElements = self.driver.find_elements_by_xpath(artclelb)
                articleCount = len(articleElements)
                print("articleCount=" + str(articleCount))
                articleIndex = 0
                # Article loop within the current layout
                while articleIndex < articleCount:
                    if articleIndex > 0:
                        self.driver.get(self.firstUrl)
                        self.driver.maximize_window()
                        layoutElements = self.driver.find_elements_by_xpath(layoutlb)
                        layoutElement = layoutElements[layoutIndex]
                        layoutLink = layoutElement.get_attribute("onclick")
                        self.driver.execute_script(layoutLink)
                    elements = self.driver.find_elements_by_xpath(contentlb)
                    sublink = elements[articleIndex].get_attribute("onclick")
                    title = elements[articleIndex].text
                    print(title)
                    self.driver.execute_script(sublink)
                    author = self.driver.find_element_by_id("Setauthor").text
                    subE = self.driver.find_elements_by_xpath("//div[@id='SetContent']/p")
                    content = ''
                    for se in subE:
                        content += se.text
                    key = ''
                    for k in self.keyword_list:
                        if content.find(k) > -1 or title.find(k) > -1:
                            key += k + ","
                    if key:
                        key = key[0:len(key) - 1]  # drop trailing comma
                    print(author)
                    print(key)
                    print('\n')
                    articleIndex += 1
                    self.db.SaveFirsFinalData(self.date, layout, self.firstUrl, title, author, key, content)
                layoutIndex += 1
        except Exception as e1:
            error = e1.args
            flag = True

    def export(self):
        """Export the day's stored articles to 第一财经日报_<date>.csv."""
        try:
            rows = self.db.GetFirsFinalData(self.date)
            lst = []
            for dataRow1 in rows:
                lst.append(dataRow1)
            count = 1
            dt = datetime.datetime.now().strftime("%Y-%m-%d")
            fileName = '第一财经日报_' + self.date + '.csv'
            header = "发表日期,关键字,作者,全文字数,标题,版面,链接,正文"
            if len(lst) > 0:
                self.WriteData(header, fileName)
            # Every article shares the same reader URL on this site.
            url = 'http://buy.yicai.com/read/index/id/5.html'
            for dataRow in lst:
                date = str(dataRow['date'])
                layout = str(dataRow['layout'])
                title = str(dataRow['title']).replace(",", ",").replace("\n", " ")
                author = str(dataRow['author']).replace(",", ",")
                key = str(dataRow['key']).replace(",", ",")
                wordcount = str(dataRow['wordcount'])
                content = str(dataRow['content']).replace(",", ",").replace("\n", " ")
                txt = "\n%s,%s,%s,%s,%s,%s,%s,%s" % (
                    date, key, author, wordcount, title, layout, url, content)
                try:
                    self.WriteData(txt, fileName)
                except Exception as e1:
                    print(str(e1))
                print(count)
                count += 1
        except Exception as e1:
            error = e1.args

    def test(self):
        """Manual end-to-end smoke test with hard-coded parameters."""
        dt = datetime.datetime.now().strftime("%Y-%m-%d")
        dirpath = "E:\python36_crawl"
        firsturl = 'http://buy.yicai.com/read/index/id/5.html'
        keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
        exportPath = "E:/News"
        obj = firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
        obj.longon()
        obj.catchData()
        obj.export()
# # dt="2018-10-08"
# dirpath ="E:\python36_crawl"
# # codepath= os.path.join(dirpath,"mediaInfo\Verycode.txt")
# # codepath='E:/python36_crawl/mediaInfo/verycode.txt'
# # file_list = os.listdir("D:\work\python36_crawl\Veriycode\code")
# # firsturl="http://buy.yicai.com/read/index/id/5.html"
# firsturl='http://buy.yicai.com/read/index/id/5.html'
# keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
# exportPath = "E:/News"
# obj = firstfinal(dt, firsturl, keyword_list, exportPath,dirpath)
# obj.longon()
# obj.catchData()
# # while True:
# # obj.savePicture()
# obj.export()
# coding=utf-8
import datetime
import finalNews_IE
import firstfinal
import Mail
import time
import os # def WriteData(message, fileName):
# fileName = os.path.join(os.getcwd(), 'mailflag.txt')
# with open(fileName) as f:
# f.write(message)
def run():
    """Daemon loop: once per tick, scrape whichever daily CSV is missing,
    and send the results by mail at 08:50.

    NOTE(review): reconstructed from an indentation-mangled paste; the
    nesting of the mail branch is inferred — confirm against the
    original source if available.
    """
    attachmentFileDir = "E:\\News"
    mailflagfile = os.path.join(os.getcwd(), 'mailflag.txt')
    while True:
        date = datetime.datetime.now()
        strtime = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        print(strtime + " 正常循环")
        dt = datetime.datetime.now().strftime("%Y-%m-%d")
        ym = datetime.datetime.now().strftime("%Y-%m")
        day = datetime.datetime.now().strftime("%d")
        fileName = '金融时报_' + dt + '.csv'
        fileName = os.path.join(attachmentFileDir, fileName)
        firstfileName = '第一财经日报_' + dt + '.csv'
        firstfileName = os.path.join(attachmentFileDir, firstfileName)
        if not os.path.exists(fileName):
            # Scrape Financial News (金融时报) — its export file is missing.
            logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
            firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
            keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
            exportPath = "E:/News"
            codedir = 'E:\python36_crawl\Veriycode'
            obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
            obj.saveUrls()
            obj.catchdata()
            obj.export()
        if not os.path.exists(firstfileName):
            # Scrape China Business News (第一财经日报) — its export file is missing.
            dirpath = "E:\python36_crawl"
            firsturl = 'http://buy.yicai.com/read/index/id/5.html'
            keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
            exportPath = "E:/News"
            obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
            obj.longon()
            obj.catchData()
            obj.export()
        if date.strftime('%H:%M') == "08:50":
            # Mail the results; the longer sleep avoids sending twice
            # within the same minute.
            obj = Mail.Mail()
            obj.test()
            time.sleep(100)
        else:
            time.sleep(10)


run()
# ym = datetime.datetime.now().strftime("%Y-%m")
# day = datetime.datetime.now().strftime("%d")
# # 采集金融时报数据
# logonUrl = "http://epaper.financialnews.com.cn/dnis/client/jrsb/index.jsp"
# # firsturl="http://epaper.financialnews.com.cn/jrsb/html/2018-09/18/node_2.htm"
# firsturl = "http://epaper.financialnews.com.cn/jrsb/html/" + ym + "/" + day + "/node_2.htm"
# # print(firsturl)
# keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
# exportPath = "E:/News"
# codedir = 'E:\python36_crawl\Veriycode'
# obj = finalNews_IE.finalNews_IE(dt, logonUrl, firsturl, keyword_list, exportPath, codedir)
# obj.saveUrls()
# obj.catchdata()
# obj.export()
#
# # 采集第一采集日报数据
# dirpath = "E:\python36_crawl"
# firsturl = 'http://buy.yicai.com/read/index/id/5.html'
# keyword_list = "银保监会,央行,中国银行,中行,中银".split(",")
# exportPath = "E:/News"
# obj = firstfinal.firstfinal(dt, firsturl, keyword_list, exportPath, dirpath)
# obj.longon()
# obj.catchData()
# obj.export() # 发送邮件
# obj = Mail.Mail()
# obj.test()
# except Exception as e1:
# print(str(e1))
05-11 09:17