网上没新的txt下载,自己想办法下载来看,可以根据网页标签不同来修改。 还没入门,不智能慢慢研究吧。
bs4要装一下,pip install BeautifulSoup4
1 #coding utf-8 2 import urllib.request 3 from bs4 import BeautifulSoup 4 import time 5 import re 6 7 def get_html(url): 8 page = urllib.request.urlopen(url) 9 html = page.read() 10 # print(bytes.decode(html)) 11 return html 12 13 ''' 14 page='https://www.xuehong.cc/book/36273/' 15 p1 = BeautifulSoup(get_html(page).decode('utf-8'), 'html.parser') 16 p2=[] 17 for p in p1.find_all('a',): 18 p2.append(p['href']) 19 print(p2) 20 ''' 21 22 p3=['/book/36273/31737154.html', '/book/36273/31737155.html', '/book/36273/31737156.html', '/book/36273/31737157.html', '/book/36273/31737158.html', '/book/36273/31737159.html', '/book/36273/31737160.html', '/book/36273/31737161.html', '/book/36273/31737162.html', '/book/36273/31737163.html', '/book/36273/31737164.html', '/book/36273/31737165.html', '/book/36273/31737166.html', '/book/36273/31737167.html', '/book/36273/31737168.html', '/book/36273/31737169.html', '/book/36273/31737170.html', '/book/36273/31863549.html', '/book/36273/32060318.html', '/book/36273/32060319.html', '/book/36273/32060320.html', '/book/36273/32157836.html', '/book/36273/32675620.html', '/book/36273/32693741.html', '/book/36273/32705629.html', '/book/36273/32720993.html', '/book/36273/32720995.html', '/book/36273/32751825.html', '/book/36273/32969531.html', '/book/36273/32969532.html', '/book/36273/32969533.html', '/book/36273/32969534.html', '/book/36273/32969535.html', '/book/36273/32969536.html', '/book/36273/32969537.html', '/book/36273/32969538.html', '/book/36273/32969539.html', '/book/36273/32969540.html', '/book/36273/32969541.html', '/book/36273/33178998.html', '/book/36273/33179002.html', '/book/36273/33179005.html', '/book/36273/33179008.html', '/book/36273/33415818.html', '/book/36273/33434196.html', '/book/36273/35213931.html', '/book/36273/35213932.html', '/book/36273/35213933.html', '/book/36273/35213934.html', '/book/36273/35213935.html', '/book/36273/35213936.html', '/book/36273/35213937.html', '/book/36273/35213938.html', '/book/36273/35213939.html', '/book/36273/35213940.html', '/book/36273/35213941.html', '/book/36273/35213942.html', '/book/36273/35213943.html', '/book/36273/35262823.html', '/book/36273/35318036.html', '/book/36273/35318037.html', '/book/36273/35362277.html', '/book/36273/35390213.html', '/book/36273/35397646.html', '/book/36273/35398640.html', '/book/36273/35410795.html', '/book/36273/35418366.html', '/book/36273/35454975.html', '/book/36273/35455295.html', '/book/36273/35456452.html', '/book/36273/35458123.html', '/book/36273/35488936.html', '/book/36273/35488937.html', '/book/36273/35495130.html', '/book/36273/35498675.html', '/book/36273/35503958.html', '/book/36273/35510595.html', '/book/36273/35510628.html', '/book/36273/35517338.html', '/book/36273/35522119.html', '/book/36273/35529846.html', '/book/36273/35536421.html', '/book/36273/35590637.html', '/book/36273/35590638.html', '/book/36273/35601859.html', '/book/36273/35657475.html', '/book/36273/35662329.html', '/book/36273/35675638.html', '/book/36273/35693345.html', '/book/36273/35693346.html', '/book/36273/35735160.html', '/book/36273/35740864.html', '/book/36273/35750550.html', '/book/36273/35754379.html', '/book/36273/35786823.html'] 23 24 url='https://www.xuehong.cc/book/36273/31737154.html' 25 i=0 26 for num in p3: 27 urlNum='https://www.xuehong.cc'+p3[i] 28 29 soup = BeautifulSoup(get_html(urlNum).decode('utf-8'), 'html.parser') 30 for j in soup.find_all('h1',): 31 print(j) 32 with open('F:\\book.txt', 'a',encoding='utf-8') as f: # 设置文件对象 33 f.write(str(j)+"\n\n") 34 35 for k in soup.find_all('div', id='content'): 36 k1=str(k).replace(" ","") 37 k2=k1.replace("<br/><br/>","\n\n") 38 print(k2) 39 with open('F:\\book.txt', 'a',encoding='utf-8') as f: # 设置文件对象 40 f.write(k2+"\n\n\n\n")
41 i=i+1