昨天那个小说网站真坑,中间每隔几章就缺一些章节,今天换了个网站爬。不知道为什么昨天的代码就不行了,出现 utf-8 解码问题,貌似是 utf-8 编码的页面不能再用 utf-8 解码。反正改了一下代码还是能用。p3[]、p4[] 是字符串数组,是先用前面注释掉的那些语句从网站上爬下来、再手工整理得到的。
# -*- coding: utf-8 -*-
"""Download the chapters of one novel and append them to a single text file.

For every chapter page listed in ``p4`` the script fetches the page from
``BOOK_URL``, prints and writes the text of each ``<h1>`` element (the
heading) and of each ``<div id="content">`` element (the chapter body) to
``OUTPUT_PATH``.
"""
import urllib.request
from bs4 import BeautifulSoup
import time
import re

# Directory page of the book; chapter file names from ``p4`` are appended.
BOOK_URL = 'https://www.i7wx.com/book/14/14933/'

# Text file that receives the chapters.  It is opened in append mode, so
# running the script again adds to whatever is already there.
OUTPUT_PATH = 'F:\\book.txt'


def get_html(url):
    """Fetch *url* and return the raw response body as ``bytes``.

    The body is deliberately left undecoded; BeautifulSoup is given the
    bytes and works out the encoding itself.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the request
            fails.
    """
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as page:
        return page.read()


# One-off discovery step, kept for reference (not executed).  It printed every
# <a> tag of the book's directory page and collected the href values; the
# chapter lists below were then cleaned up by hand from that output.
#
# page = 'https://www.i7wx.com/book/14/14933/'
# # p1 = BeautifulSoup(get_html(page).decode('utf-8'), 'html.parser')
# p1 = BeautifulSoup(get_html(page), 'html.parser')
# # print(p1)
# p2 = []
# for p in p1.find_all('a',):
#     print(p)
#     if "href" in str(p):
#         p2.append(p['href'])
# print(p2)

# Chapter paths for the site used previously, kept for reference (unused).
# url = 'https://www.xuehong.cc/book/36273/31737154.html'
# p3 = [
#     '/book/36273/31737154.html', '/book/36273/31737155.html', '/book/36273/31737156.html',
#     '/book/36273/31737157.html', '/book/36273/31737158.html', '/book/36273/31737159.html',
#     '/book/36273/31737160.html', '/book/36273/31737161.html', '/book/36273/31737162.html',
#     '/book/36273/31737163.html', '/book/36273/31737164.html', '/book/36273/31737165.html',
#     '/book/36273/31737166.html', '/book/36273/31737167.html', '/book/36273/31737168.html',
#     '/book/36273/31737169.html', '/book/36273/31737170.html', '/book/36273/31863549.html',
#     '/book/36273/32060318.html', '/book/36273/32060319.html', '/book/36273/32060320.html',
#     '/book/36273/32157836.html', '/book/36273/32675620.html', '/book/36273/32693741.html',
#     '/book/36273/32705629.html', '/book/36273/32720993.html', '/book/36273/32720995.html',
#     '/book/36273/32751825.html', '/book/36273/32969531.html', '/book/36273/32969532.html',
#     '/book/36273/32969533.html', '/book/36273/32969534.html', '/book/36273/32969535.html',
#     '/book/36273/32969536.html', '/book/36273/32969537.html', '/book/36273/32969538.html',
#     '/book/36273/32969539.html', '/book/36273/32969540.html', '/book/36273/32969541.html',
#     '/book/36273/33178998.html', '/book/36273/33179002.html', '/book/36273/33179005.html',
#     '/book/36273/33179008.html', '/book/36273/33415818.html', '/book/36273/33434196.html',
#     '/book/36273/35213931.html', '/book/36273/35213932.html', '/book/36273/35213933.html',
#     '/book/36273/35213934.html', '/book/36273/35213935.html', '/book/36273/35213936.html',
#     '/book/36273/35213937.html', '/book/36273/35213938.html', '/book/36273/35213939.html',
#     '/book/36273/35213940.html', '/book/36273/35213941.html', '/book/36273/35213942.html',
#     '/book/36273/35213943.html', '/book/36273/35262823.html', '/book/36273/35318036.html',
#     '/book/36273/35318037.html', '/book/36273/35362277.html', '/book/36273/35390213.html',
#     '/book/36273/35397646.html', '/book/36273/35398640.html', '/book/36273/35410795.html',
#     '/book/36273/35418366.html', '/book/36273/35454975.html', '/book/36273/35455295.html',
#     '/book/36273/35456452.html', '/book/36273/35458123.html', '/book/36273/35488936.html',
#     '/book/36273/35488937.html', '/book/36273/35495130.html', '/book/36273/35498675.html',
#     '/book/36273/35503958.html', '/book/36273/35510595.html', '/book/36273/35510628.html',
#     '/book/36273/35517338.html', '/book/36273/35522119.html', '/book/36273/35529846.html',
#     '/book/36273/35536421.html', '/book/36273/35590637.html', '/book/36273/35590638.html',
#     '/book/36273/35601859.html', '/book/36273/35657475.html', '/book/36273/35662329.html',
#     '/book/36273/35675638.html', '/book/36273/35693345.html', '/book/36273/35693346.html',
#     '/book/36273/35735160.html', '/book/36273/35740864.html', '/book/36273/35750550.html',
#     '/book/36273/35754379.html', '/book/36273/35786823.html',
# ]

# Chapter page file names under BOOK_URL, in the order they are downloaded.
p4 = [
    '29497614.html', '29521741.html', '29553661.html', '29558911.html', '29570352.html',
    '29591242.html', '29591243.html', '29592356.html', '29607245.html', '29639200.html',
    '29683266.html', '29684993.html', '29688180.html', '29699659.html', '29699660.html',
    '29703952.html', '29754475.html', '29770381.html', '29781210.html', '29781211.html',
    '29781212.html', '29781213.html', '29783803.html', '29791587.html', '29798479.html',
    '29842060.html', '29856708.html', '29876792.html', '29881350.html', '29903213.html',
    '29915434.html', '29915435.html', '29934455.html', '29938272.html', '29940052.html',
    '29951592.html', '29959942.html', '29963651.html', '29976491.html', '29981650.html',
    '29984975.html', '29996708.html', '30007939.html', '30031650.html', '30047043.html',
    '30065849.html', '30081303.html', '30102770.html', '30128945.html', '30146213.html',
    '30146456.html', '30154506.html', '30172168.html', '30180717.html', '30208912.html',
    '30208914.html', '30222437.html', '30238855.html', '30246629.html', '30304265.html',
    '30334083.html', '30345080.html', '30348020.html', '30360117.html', '30368006.html',
    '30393530.html', '30408984.html', '30414503.html', '30416144.html', '30441267.html',
    '30441268.html', '30454974.html', '30460811.html', '30471801.html', '30482304.html',
    '30490880.html', '30500853.html', '30507451.html', '30514975.html', '30519157.html',
    '30585514.html', '30585515.html', '30585516.html', '30625914.html', '30631592.html',
    '30645907.html', '30688571.html', '30688572.html', '30755591.html', '30772449.html',
    '30781034.html', '30784347.html', '30849171.html',
]


def _chapter_text(content):
    """Return the plain text of a chapter's content element.

    Every ``<br>`` becomes a newline, so the ``<br/><br/>`` pairs between
    paragraphs turn into blank lines.  The element is modified in place.
    """
    for line_break in content.find_all('br'):
        line_break.replace_with("\n")
    # Taking the text (rather than str(tag)) keeps the surrounding <div>
    # markup out of the saved book.
    # NOTE(review): the replaced character is reproduced as it appears in the
    # pasted source (a plain space); it may originally have been a
    # non-breaking space used for paragraph indentation -- confirm.
    return content.get_text().replace(" ", "")


def main():
    """Download every chapter in ``p4`` and append it to ``OUTPUT_PATH``."""
    # Open the output once for the whole run instead of once per write.
    with open(OUTPUT_PATH, 'a', encoding='utf-8') as book:
        for page_name in p4:
            # soup = BeautifulSoup(get_html(...).decode('utf-8'), 'html.parser')
            # Passing the raw bytes avoids the explicit utf-8 decode.
            soup = BeautifulSoup(get_html(BOOK_URL + page_name), 'html.parser')

            for heading in soup.find_all('h1'):
                # get_text() drops both the opening and the closing tag;
                # replacing only "<h1>" left "</h1>" in the saved title.
                title = heading.get_text()
                print(title)
                book.write(title + "\n\n")

            for content in soup.find_all('div', id='content'):
                text = _chapter_text(content)
                print(text)
                book.write(text + "\n\n\n\n")


if __name__ == '__main__':
    main()