目的:

使用requests库以及xpath解析爬取实验楼所有课程,存入MySQL数据库中。

准备工作:

首先安装requests库、lxml库以及peewee库。在命令行模式,使用以下命令。

pip install requests
pip install lxml
pip install peewee

  然后,就可以打开编辑器编写代码了。

代码:

 # 引入所需要的库
import time
import requests
from peewee import *
from lxml import etree
# 这个程序要倒着看 # 这个是连接数据库的,名字和密码根据自己情况修改
db = MySQLDatabase('shiyanlou', user='root', passwd='xxxxxx') class Course(Model):
title = CharField()
teacher = CharField()
teacher_courses = IntegerField()
tag = CharField()
study_num = IntegerField()
content = CharField() class Meta:
database = db Course.create_table() def parse_content(url, title, tag, study_num):
print('课程地址:' + url)
res = requests.get(url)
xml = etree.HTML(res.text)
# 获取页面里的简介
try:
content = xml.xpath('//meta[@name="description"]/@content')[0]
except Exception as e:
content = '无'
# 获取老师名字
try:
teacher = xml.xpath(
'//div[@class="sidebox mooc-teacher"]//div[@class="mooc-info"]/div[@class="name"]/strong/text()')[0]
except Exception as e:
teacher = '匿名'
# 获取老师发表课程数目
try:
teacher_courses = xml.xpath(
'//div[@class="sidebox mooc-teacher"]//div[@class="mooc-info"]/div[@class="courses"]/strong/text()')[0]
except Exception as e:
teacher_courses = '未知'
# 存入数据库
try:
course = Course(title=title, teacher=teacher,
teacher_courses=int(teacher_courses), tag=tag, study_num=int(study_num), content=content)
course.save()
except Exception as e:
print('一条数据存取失败') def get_course_link(url):
# 获取每一页的信息,传给下一个函数
response = requests.get(url)
xml = etree.HTML(response.text)
# contains()是包含的意思
courses = xml.xpath(
'//div[contains(@class, "col-md-3") and contains(@class, "col-sm-6") and contains(@class, "course")]')
for course in courses:
try:
url = 'https://www.shiyanlou.com' + course.xpath('.//a/@href')[0]
except Exception as e:
print('一个课程页面未获得')
continue
title = course.xpath('.//div[@class="course-name"]/text()')[0]
study_people = course.xpath(
'.//span[@class="course-per-num pull-left"]/text()')[1].strip()
# study_people = int(study_people)
try:
tag = course.xpath(
'.//span[@class="course-money pull-right"]/text()')[0]
except Exception as e:
tag = "普通"
parse_content(url=url, title=title, tag=tag, study_num=study_people)
# time.sleep(0.5) def main():
# 通过requests库的get获得目标地址的返回信息,类型为Response
response = requests.get('https://www.shiyanlou.com/courses/')
# 将返回信息的文本转化为xml树,可以通过xpath来进行查询
xml = etree.HTML(response.text)
# 由分析网页源代码可以总结,url分页模式,只有最后的数字不一样
course_link = 'https://www.shiyanlou.com/courses/?category=all&course_type=all&fee=all&tag=all&page={}'
# 这里获得最大页数就可以了,xpath()函数里的便是寻找路径了
# //会在全文来进行查找,//ul则是查找全文的ul标签,//ul[@class="pagination"]会仅查找有class属性,
# 且为"pagination"的标签,之后/li是查找当前的ul标签下的li标签(仅取一层),取查询到的列表倒数第二个标签
# 为li[last()-1],/a/text()查询a标签里的文本内容
page = xml.xpath('//ul[@class="pagination"]/li[last()-1]/a/text()')
if len(page) != 1:
print('爬取最大页数时发生错误!!')
return None
# page原是一个列表,这里取出它的元素,并转化为Int型
page = int(page[0])
# 将每一页的url传给get_course_link函数进行处理
for i in range(1, page + 1):
# 填入course_link,获取完整url
url = course_link.format(i)
print('页面地址:' + url)
# 调用另一个函数
get_course_link(url) if __name__ == '__main__':
# 调用main函数
main() # [Finished in 218.5s]
05-11 17:01