'''本次爬取讲历史网站'''#!usr/bin/env python#-*- coding:utf-8 _*-"""@author:Hurrican@file: 分页爬取数据.py@time: 2018/11/03 9:30
"""from bs4 import BeautifulSoupimport requests
def get_urls(base_url='http://www.jianglishi.cn/jiemi/', last_page=20):
    """Return the listing-page URLs to crawl, first page first.

    The first page is ``base_url`` itself; every later page ``n`` is
    ``base_url`` followed by ``page_<n>.html``.

    Args:
        base_url: URL of the first listing page, ending with a slash.
        last_page: Number of the last page to include. Values below 2
            yield only the first page.

    Returns:
        A new list of URL strings, one per page.
    """
    urls = [base_url]
    # Numbered pages start at 2 because page 1 is served at base_url.
    urls.extend(
        '{}page_{}.html'.format(base_url, page)
        for page in range(2, last_page + 1)
    )
    return urls
def get_title():
    """Print the link text of every title block on every listing page.

    Each URL returned by ``get_urls()`` is fetched with ``requests``, parsed
    with BeautifulSoup using the ``html5lib`` parser, and the text of the
    ``<a>`` inside every ``<div class="title">`` is printed, one per line.
    """
    for page_url in get_urls():
        # A timeout keeps one unresponsive page from hanging the whole crawl.
        response = requests.get(page_url, timeout=10)
        # Decode the body as UTF-8 instead of relying on the detected charset.
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html5lib')
        for title_div in soup.find_all(name='div', attrs={'class': 'title'}):
            link = title_div.a
            # A title block without a link would raise AttributeError below.
            if link is None:
                continue
            print(link.string)
if __name__ == '__main__':
    get_title()

运行结果:
方法2:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author:lenovo
@file: spider_urllib.py
@time: 2018/11/07 14:31
"""
import urllib.request

from bs4 import BeautifulSoup

# Python 3 str <-> bytes conversion: str.encode() turns a str into bytes,
# and bytes.decode() turns bytes back into a str.


def get_content():
    """Print the link text of every title block on every listing page.

    Same crawl as the ``requests`` version, but fetched with
    ``urllib.request``: each page is downloaded, decoded as UTF-8, parsed with
    BeautifulSoup (``html5lib`` parser), and the text of the ``<a>`` inside
    every ``<div class="title">`` is printed.
    """
    base_url = 'http://www.jianglishi.cn/jiemi/'
    # Page 1 is the section root; pages 2-20 follow the page_<n>.html pattern.
    urls = [base_url]
    urls.extend('{}page_{}.html'.format(base_url, page) for page in range(2, 21))

    for url in urls:
        # The context manager closes the connection even if read() or
        # decode() raises; the timeout stops a dead server from blocking.
        with urllib.request.urlopen(url, timeout=10) as response:
            content = response.read().decode('utf-8')

        soup = BeautifulSoup(content, 'html5lib')
        for title_div in soup.find_all('div', class_="title"):
            link = title_div.a
            # A title block without a link would raise AttributeError below.
            if link is None:
                continue
            print(link.string)


if __name__ == '__main__':
    get_content()
衍生阅读:
我们来继续爬取图片
#!usr/bin/env python# -*- coding: utf-8 -*-"""@author:lenovo@file: spider_urllib.py@time: 2018/11/07 14:31
"""import urllib.requestfrom bs4 import BeautifulSoup
# Python 3 str <-> bytes conversion: str.encode() turns a str into bytes,
# and bytes.decode() turns bytes back into a str.


def get_urls(base_url='http://www.jianglishi.cn/jiemi/', last_page=20):
    """Return the listing-page URLs to crawl, first page first.

    The first page is ``base_url`` itself; every later page ``n`` is
    ``base_url`` followed by ``page_<n>.html``.

    Args:
        base_url: URL of the first listing page, ending with a slash.
        last_page: Number of the last page to include. Values below 2
            yield only the first page.

    Returns:
        A new list of URL strings, one per page.
    """
    urls = [base_url]
    # Numbered pages start at 2 because page 1 is served at base_url.
    urls.extend(
        '{}page_{}.html'.format(base_url, page)
        for page in range(2, last_page + 1)
    )
    return urls
def get_content(save_dir=r'H:\py\image'):
    """Download the article images from every listing page.

    Each URL returned by ``get_urls()`` is fetched and parsed with
    BeautifulSoup (``html5lib`` parser). Every ``<img>`` whose ``onerror``
    attribute matches the fallback handler below is saved into ``save_dir``
    as ``1.jpg``, ``2.jpg``, ... in the order encountered.

    Args:
        save_dir: Directory the images are written to. It is created if it
            does not exist.
    """
    import os

    # Only images carrying exactly this onerror handler are selected.
    onerror_handler = "this.src='/statics/statics/img/nopic.gif';this.onerror=null;"

    os.makedirs(save_dir, exist_ok=True)
    image_number = 1
    for url in get_urls():
        # The context manager closes the connection even if read() or
        # decode() raises.
        with urllib.request.urlopen(url) as response:
            content = response.read().decode('utf-8')

        soup = BeautifulSoup(content, 'html5lib')
        all_images = soup.find_all('img', onerror=onerror_handler)
        print(all_images)

        for img in all_images:
            src = img.get('src')
            if not src:
                continue
            # At least one link contains a full-width period ("。") in its
            # host name; normalise it so the URL is well-formed.
            image_url = src.replace('。', '.')
            target = os.path.join(save_dir, '%s.jpg' % image_number)
            try:
                urllib.request.urlretrieve(image_url, target)
            except (OSError, ValueError) as err:
                # URLError/HTTPError are OSError subclasses and a malformed
                # URL raises ValueError; skip the image rather than abort.
                print("下载失败 %s: %s" % (src, err))
                continue
            print("正在下载%s" % src)
            image_number += 1
    print("下载完成")
if __name__ == '__main__':
    get_content()

# Example of a tag from the site whose src contains a full-width period
# ("。") instead of "." in the host name:
# <img src="http://cimg2。163.com/cnews/2006/9/25/20060925163612ab80e.jpg" alt="三峡蓄水奉节老城全淹 “刘备疑冢”永沉长江底" onerror="this.src='/statics/statics/img/nopic.gif';this.onerror=null;">
上面这种情况直接用 try...except 捕获异常就能避免。