#python3.7
'''
功能:实现www.biqukan.com/1_1094/5403177.html小说下载为txt
v1.0
'''
import requests,sys,time
from lxml import etree
##0.获取所有章节url
def get_url_list(catalog_url):
res=requests.get(catalog_url)
#time.sleep(1)
if res.status_code==200:
text=res.text
html=etree.HTML(text)
hrefs=html.xpath('//div[@class="listmain"]/dl/dt[2]/following-sibling::*/a/@href')
#print('hrefs')
return hrefs
return None ##1.获取页面
def get_one_page(url):
res=requests.get(url)
#time.sleep(1)
if res.status_code==200:
return res.text
return None ##2.解析页面
def parse_one_page(text):
html=etree.HTML(text)
title=html.xpath('//div[@class="content"]/h1//text()')
content=html.xpath('//div[@class="showtxt"]//text()') #去掉换行
contents=''.join(content).replace('\xa0'*8,'\n'*2)#把列表转换为一整段文本,并把8个空格换为2个换行
#print(title,contents) #print(title)
return title,contents #返回多个参数,相当于返回一个元组return(title,content) ##3.保存内面
def write_to_file(title,contents):
with open('一念永恒.txt','a',encoding='utf-8')as f:
f.write(title[0]+'\n'+contents+'\n') #for content in contents:
# f.write(content) ##主函数
def main():
#0.获取章节列表的网址
catalog_url='https://www.biqukan.com/1_1094/'
urls=get_url_list(catalog_url)
#print(urls)
#1.把网址传入详情抓取页面,并保存
for i in range(len(urls)):
rel_url='https://www.biqukan.com'+urls[i]
print(rel_url)
text=get_one_page(rel_url)
#接收2步return的多个参数写法a,b=(x,y)
title,contents=parse_one_page(text)
#print(title,contents)
#写入txt
write_to_file(title[0],contents)
#显示下载进度
sys.stdout.write(" 已下载:%.3f%%" % float(i/len(urls)) + '\r')
sys.stdout.flush() ##执行入口
if __name__=='__main__':
main()
05-11 09:32