pc版大概有500条记录,mobile大概是50部,只有热门的,所以少一点
url构造很简单,主要参数就是page_limit与page_start,每翻一页,start+=20即可,tag是"美剧"编码后的结果,直接带着也可以,用unquote解码也可以,注意headers中一定要带上Referer
import json
import requests
import math
import os
import shutil
from pprint import pprint
from urllib import parse


class DoubanSpliderPC:
    """Scrape the Douban PC "American TV" listing (~500 records) and
    append each subject as one JSON object per line to a local text file.
    """

    def __init__(self):
        # Decode the percent-encoded tag (%E7%BE%8E%E5%89%A7 == "美剧") so the
        # printed URL is human-readable; the site accepts either form.
        self.url = parse.unquote(
            "https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%BE%8E%E5%89%A7&sort=recommend&page_limit=20&page_start={}")
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            # Douban rejects requests that lack a Referer header.
            "Referer": "https://movie.douban.com/tv/"
        }
        self.file_dir = "./douban_american_pc.txt"

    def parse_url(self):
        """Page through the JSON endpoint (20 items per page) and append every
        subject to ``self.file_dir``, one JSON object per line.

        Stops when a page returns fewer than 20 items (i.e. the last page).
        """
        start = 0
        while True:
            url = self.url.format(start)
            print(url)
            # timeout keeps a stalled connection from hanging the scraper forever
            response = requests.get(url, headers=self.headers, timeout=10)
            subjects_list = response.json()["subjects"]
            with open(self.file_dir, "a", encoding="utf-8") as file:
                for subject in subjects_list:
                    file.write(json.dumps(subject, ensure_ascii=False))
                    # plain "\n": in text mode "\r\n" would be written as
                    # "\r\r\n" on Windows
                    file.write("\n")
            if len(subjects_list) < 20:  # short page == last page
                break
            start += 20

    def run(self):
        # Remove data saved by a previous run so appends don't create duplicates.
        if os.path.exists(self.file_dir):
            os.remove(self.file_dir)
            print("文件已清空")
        self.parse_url()


def main():
    splider = DoubanSpliderPC()
    splider.run()


if __name__ == '__main__':
    main()
mobile版类似,不过抓包的时候找那个item接口就可以了
import json
import requests
import math
import os
import shutil
from pprint import pprint # 爬取豆瓣的美剧页面(手机版只有50条)
class DouBanSpliderMobile:
    """Scrape the Douban mobile "American TV" collection (~50 items) and
    append each page's item list, as a JSON array, to a local text file.
    """

    pageCount = 18  # items per page; matches count=18 in the request URL
    total = None    # total item count, filled in by parse_url()

    def __init__(self):
        self.first_url = "https://m.douban.com/rexxar/api/v2/subject_collection/tv_american/items?os=ios&for_mobile=1&start={}&count=18&loc_id=108288&_=1552995446961"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            # the mobile API rejects requests that lack a Referer header
            "Referer": "https://m.douban.com/tv/american"
        }
        self.file_dir = "./douban_american_mobile.txt"

    def get_url_list(self):
        """Build one URL per page, based on the total set by parse_url()."""
        pages = math.ceil(DouBanSpliderMobile.total / DouBanSpliderMobile.pageCount)
        # reuse pageCount for the offset instead of a second hard-coded 18
        return [self.first_url.format(i * DouBanSpliderMobile.pageCount)
                for i in range(pages)]

    def parse_url(self, url):
        """Fetch one page, record the collection total, and append its items."""
        # timeout keeps a stalled connection from hanging the scraper forever
        response = requests.get(url, headers=self.headers, timeout=10)
        response_dict = response.json()
        DouBanSpliderMobile.total = int(response_dict["total"])
        # NOTE: appends one pretty-printed JSON array per page, so the file as
        # a whole is not a single valid JSON document (same as before).
        with open(self.file_dir, "a", encoding="utf-8") as file:
            json.dump(response_dict["subject_collection_items"], file,
                      ensure_ascii=False, indent=2)

    def run(self):
        # Delete leftovers from a previous run *before* writing anything.
        # (Previously the file was deleted after the first page had already
        # been written, discarding that page and fetching start=0 twice.)
        if os.path.exists(self.file_dir):
            os.remove(self.file_dir)
        # The first request writes page 0 and fills in `total`.
        self.parse_url(self.first_url.format(0))
        # Page 0 is already on disk, so skip the first generated URL.
        for url in self.get_url_list()[1:]:
            self.parse_url(url)


def main():
    douban_splider = DouBanSpliderMobile()
    douban_splider.run()


if __name__ == '__main__':
    main()