抓取csdn论坛

实现功能

  1. 获取论坛分类所有链接,并拼接成推荐精华页的完整链接

  2. 获取推荐精华页的帖子状态,赏分,帖子标题,作者,发布时间,回复量,查看量,最后发表时间

  3. 置顶内容不爬取,只打印置顶内容的帖子标题,作者信息和帖子链接

  4. 跳过没有内容的版块,并打印出来

  5. 把获取到的内容添加到数据库

未实现功能

  1. 获取非技术论坛时遇到一个讨论帖子,前面没有帖子状态,出现index下标异常

  2. 未爬取推荐精华页的所有帖子(只实现了第一页的爬取)

  3. 未爬取帖子内容(帖子发布的内容和回复信息等等)

  4. 未爬取发帖人的个人信息(排名,发帖数,回帖数,结帖率等等)

总结:

  1. python的很多基础方法不知道

  2. 字符串操作不熟练

  3. xpath语法不熟练

  4. peewee很多方法不知道

  5. 马虎,判断帖子有没有内容时,后面忘加一个方法导致运行错误,找了半个多小时才找到

  6. 未实现功能应该都能解决,只是嫌麻烦

"""
抓取
解析
存储
"""
import re
import ast
from urllib import parse
import requests
from scrapy import Selector
import json
import time
# Star-import of the project's ORM models; presumably this is where Topic
# (used when storing scraped rows) comes from -- TODO confirm.
from csdn_spider.models import *

# Base URL that relative forum links are joined against.
domain = 'https://bbs.csdn.net'
def get_nodes_json():
    """Download the forum's left-menu script and return its node tree.

    The script embeds the tree after the text ``forumNodes: ``. That literal
    is captured with a regular expression and evaluated as a Python literal.

    Returns:
        The parsed node list, or an empty list when the marker is not found.
    """
    menu_js = requests.get('https://bbs.csdn.net/dynamic_js/left_menu.js?csdn').text
    found = re.search('forumNodes: (.*])', menu_js)
    if not found:
        return []
    # The payload is a JavaScript literal: rewrite ``null`` as ``None`` so
    # that ast.literal_eval can parse it.
    python_literal = found.group(1).replace('null', 'None')
    return ast.literal_eval(python_literal)


# Module-level accumulator that process_nodes_list appends URLs to.
url_list = []
def process_nodes_list(nodes_list):
    """Recursively collect every non-empty ``url`` found in the node tree.

    Results are appended to the module-level ``url_list``.

    Args:
        nodes_list: Iterable of node dicts; each may carry a ``url`` and a
            ``children`` list of further nodes.
    """
    for item in nodes_list:
        url = item.get('url')
        if url:
            url_list.append(url)
        # A missing or None ``children`` entry is treated as "no children".
        process_nodes_list(item.get('children') or [])


def get_levell_list(nodes_list):
    """Return the non-empty ``url`` values of the top-level nodes only."""
    return [item['url'] for item in nodes_list if item.get('url')]


def get_last_list():
    """Build the list of "recommend" page URLs that should be crawled.

    Every URL in the node tree is collected, the top-level ones are dropped,
    and ``/recommend`` is appended to the rest before joining with ``domain``.

    Returns:
        List of absolute URLs.
    """
    nodes_list = get_nodes_json()
    # Start from an empty accumulator so repeated calls do not return
    # duplicated URLs.
    url_list.clear()
    process_nodes_list(nodes_list)
    top_level = set(get_levell_list(nodes_list))
    return [
        parse.urljoin(domain, url + '/recommend')
        for url in url_list
        if url not in top_level
    ]


def _first(node, query, default=None):
    """Return the first result of XPath ``query`` on ``node``, else ``default``."""
    found = node.xpath(query).extract()
    return found[0] if found else default


def _to_int(text, default=0):
    """Convert scraped text to ``int``; fall back to ``default`` if impossible."""
    try:
        return int(text)
    except (TypeError, ValueError):
        return default


def _print_sticky_row(tr):
    """Print author, title and link of a sticky ("[置顶]") row."""
    print('发现置顶!!!')
    print('置顶账号为:', _first(tr, 'td[@class="forums_author"]/a/text()', ''))
    print('置顶内容为:', _first(tr, 'td[@class="forums_topic"]/a[2]/text()', ''))
    print('置顶链接为:', parse.urljoin(domain, _first(tr, 'td[@class="forums_topic"]/a[2]/@href', '')))
    print('###############')


def _parse_topic_row(tr):
    """Extract the Topic field values from one listing row.

    Returns:
        Dict of keyword arguments for ``Topic.create``, or ``None`` when the
        row lacks a field that cannot be defaulted (topic link, author link,
        creation time or last-reply time).
    """
    # Imported locally: the module's import block does not provide datetime.
    from datetime import datetime

    topic_href = _first(tr, 'td[@class="forums_topic"]/a/@href')
    author_href = _first(tr, 'td[@class="forums_author"]/a/@href')
    created = _first(tr, 'td[@class="forums_author"]/em/text()')
    last_pub = _first(tr, 'td[@class="forums_last_pub"]/em/text()')
    if None in (topic_href, author_href, created, last_pub):
        return None

    topic_url = parse.urljoin(domain, topic_href)
    # The cell text is split on '/' into reply count (first) and view count (last).
    reply_view = _first(tr, 'td[@class="forums_reply"]/span/text()', '').split('/')
    time_format = '%Y-%m-%d %H:%M'
    return {
        'id': topic_url.split('/')[-1],
        # Some rows have no status cell; store an empty status instead of
        # failing with IndexError.
        'status': _first(tr, 'td[@class="forums_topic_flag"]/span/text()', ''),
        'score': _to_int(_first(tr, 'td[@class="forums_score"]/em/text()')),
        'title_url': topic_url,
        'title': _first(tr, 'td[@class="forums_topic"]/a/text()', ''),
        'author_id': author_href.split('/')[-1],
        'author_name': _first(tr, 'td[@class="forums_author"]/a/text()', ''),
        'create_time': datetime.strptime(created, time_format),
        'answer_nums': _to_int(reply_view[0]),
        'click_nums': _to_int(reply_view[-1]),
        'last_answer_time': datetime.strptime(last_pub, time_format),
    }


def parse_list(url):
    """Fetch one listing page and store each regular topic row.

    Sticky rows are only printed, not stored. A page whose first row
    contains '没有帖子' is reported and skipped.

    Args:
        url: Absolute URL of the listing page.
    """
    res_text = requests.get(url).text
    rows = Selector(text=res_text).xpath('//table[@class="forums_tab_table"]/tbody//tr')
    rows_html = rows.extract()
    if not rows_html:
        return
    if '没有帖子' in rows_html[0]:
        print('没有帖子')
        return

    for tr in rows:
        if tr.xpath('td[@class="forums_topic"]/span[1]/text()').extract() == ['[置顶]']:
            _print_sticky_row(tr)
            continue
        fields = _parse_topic_row(tr)
        if fields is None:
            print('跳过缺少必要字段的帖子')
            continue
        # NOTE(review): create() inserts a new row, so re-crawling a topic
        # whose id is already stored will likely fail on the primary key --
        # confirm the desired handling.
        Topic.create(**fields)


if __name__ == '__main__':
    for url in get_last_list():
        print('正在连接:', url)
        parse_list(url)
        # Pause between pages to avoid hammering the site.
        time.sleep(3)
from peewee import *#建立数据库连接
# Database handle that every model below is bound to.
# NOTE(review): the credentials are hard-coded here; consider loading them
# from configuration instead.
db = MySQLDatabase('spider', host='127.0.0.1', port=3306, user='root', password='123456lmr')


class BaseModel(Model):
    """Common base class that binds each model to the shared ``db``."""

    class Meta:
        database = db


# Table-design notes, kept verbatim in the bare string below. In short:
# give char columns a max length, use TextField when the length cannot be
# bounded, decide between ``default`` and ``null=True``, and note that
# non-int primary keys did not work (possibly a version issue).
'''
设计数据表的时候需要注意的点
char类型,尽量设置MAX(最大长度)
对于无法确定最大长度的,要采用TextField类型
default和null=True
主键无法设置int以外的类型(可能是版本问题)
'''

# Topic list
class Topic(BaseModel):
    """One forum topic row: title, link, author, counters and timestamps."""
    # Topic title
    title = CharField()
    # Topic link
    title_url = CharField(default='')
    # # Topic body
    # content = TextField(default='')
    # Topic id (used as the primary key)
    id = IntegerField(primary_key=True)
    # Author id
    author_id = CharField()
    # Author display name
    author_name = CharField()
    # Creation time
    create_time = DateTimeField()
    # Number of replies
    answer_nums = IntegerField(default=0)
    # Number of views
    click_nums = IntegerField(default=0)
    # # Number of likes
    # parised_nums = IntegerField(default=0)
    # # Thread-closing rate
    # jtl = FloatField(default=0.0)
    # Reward points
    score = IntegerField(default=0)
    # Status
    status = CharField()
    # Time of the last reply
    last_answer_time = DateTimeField()


# Topic content
class Answer(BaseModel):
    """One piece of topic content, stored with its author and timestamp."""
    # presumably the id of the Topic this entry belongs to -- TODO confirm
    topic_id = IntegerField()
    author = CharField()
    content = TextField(default="")
    create_time = DateTimeField()
    parised_nums = IntegerField(default=0)  # number of likes


# Users
class Author(BaseModel):
    """Profile data and activity counters for one forum user."""
    name = CharField()
    sign_name_id = CharField()
    # id = CharField(primary_key=True)
    click_nums = IntegerField(default=0)  # visit count
    original_nums = IntegerField(default=0)  # original post count
    forward_nums = IntegerField(default=0)  # repost count
    # Rank. The default is the string '-1' so that it matches the column's
    # character type (presumably meaning "unknown" -- TODO confirm).
    rate = CharField(default='-1')
    answer_nums = IntegerField(default=0)  # comment count
    parised_nums = IntegerField(default=0)  # likes received
    desc = TextField(null=True)
    industry = CharField(null=True)
    location = CharField(null=True)
    follower_nums = IntegerField(default=0)  # follower count
    following_nums = IntegerField(default=0)  # following count


if __name__ == '__main__':
    # Running this module directly creates all three tables in one call.
    db.create_tables([Topic, Answer, Author])
05-15 02:33