import requests
from bs4 import BeautifulSoup

# Drug-listing index; pages are BASE + page number (1-based).
URL = 'http://shuju.3156.cn/gcyp/index-page-'
# Alternate data source kept for reference:
# "http://app1.sfda.gov.cn/datasearchcnda/face3/base.jsp?tableId=25&tableName=TABLE25&title=%B9%FA%B2%FA%D2%A9%C6%B7&bcId=152904713761213296322795806604"


def getHTML(url):
    """Fetch *url* and return the raw (undecoded) response body as bytes.

    The site serves GB-family encoded pages; decoding is deliberately left
    to the HTML parser (see parseHTML), so the bytes are returned as-is.
    """
    # BUG FIX: the original set r.encoding = 'gbk2312', which is not a valid
    # codec name ('gb2312'/'gbk' are).  It was harmless only because
    # r.content (bytes) was returned and r.text never touched — dropped.
    # A timeout is added so a stalled page cannot hang the whole crawl.
    r = requests.get(url, timeout=10)
    return r.content


def parseHTML(html):
    """Print the text of every <td> cell in the first <table> of *html*.

    Pages without a table are skipped quietly (the original raised
    IndexError on tables[0] in that case).
    """
    # BUG FIX: bs4 3.x keyword fromEncoding -> from_encoding (modern
    # BeautifulSoup rejects the old spelling); gb18030 is a superset of
    # gb2312/gbk, so it decodes the site safely.  An explicit parser is
    # named to avoid the "no parser specified" warning.
    soup = BeautifulSoup(html, 'html.parser', from_encoding='gb18030')
    tables = soup.find_all('table')  # find_all: modern alias of findAll
    if not tables:
        return
    for tr in tables[0].find_all('tr'):
        for td in tr.find_all('td'):
            print(td.get_text())


if __name__ == '__main__':
    # Crawl index pages 1..999.  Guarded by __main__ so importing this
    # module no longer kicks off ~1000 HTTP requests as a side effect.
    for num in range(1, 1000):  # list(...) wrapper was unnecessary
        html = getHTML(URL + str(num))
        print("第" + str(num) + "页")
        parseHTML(html)
    # NOTE(review): the original ended with a stray debug print(html)
    # dumping the last page's raw bytes; removed as leftover debugging.
最近在研究 Python，初步尝试解决爬虫的问题
抓取页面地址:http://shuju.3156.cn/gcyp/index-page-1
主要抓取药品名称、生产企业、产品类别等内容。以上程序已经完成运行。