抓取网页: urllib.request.urlopen(url).read().decode('utf-8') —— decode() 使用的编码必须与网页的实际编码一致(百度是 utf-8;谷歌不是 utf-8,也不是 cp936,ascii 也不行,iso-8859-1 勉强可用)
PyQt4.QtGui 的 QTextEdit 控件会自动解析并渲染 HTML 文档;若要原样显示 HTML 源码(纯文本),应使用 QTextEdit.setPlainText() 方法
例:抓取网页的标题、图片和链接
import re
import sys
import urllib.request

from PyQt4 import QtGui

# Page that is crawled when the window is created.
PAGE_URL = 'http://www.baidu.com/s?wd=python'
# Debug dump of the downloaded HTML (hard-coded Windows path).
DUMP_PATH = 'e:/temp.txt'

# Patterns are compiled once at import time; '.' does not match newlines,
# so each match is confined to a single line of the page.
_TITLE_RE = re.compile('<title>(.+?)</title>')
_IMAGE_RE = re.compile('<img src="(.+?)"')
_LINK_RE = re.compile('href="(.+?)"')


class MainWindow(QtGui.QWidget):
    """Window listing the title, image sources and links of a web page.

    On construction the widget downloads ``PAGE_URL``, saves the raw HTML
    to ``DUMP_PATH`` and shows the extracted items in a read-only text box.
    """

    def __init__(self):
        """Build the UI, then fetch the page and display what was found.

        Raises:
            urllib.error.URLError: if the page cannot be downloaded.
            UnicodeDecodeError: if the page is not valid UTF-8.
            OSError: if the dump file cannot be written.
        """
        super(MainWindow, self).__init__()
        self.setWindowTitle('Crawl')
        self.resize(485, 300)

        self.txt = QtGui.QTextEdit()
        self.txt.setReadOnly(True)
        grid = QtGui.QGridLayout()
        grid.addWidget(self.txt)
        self.setLayout(grid)

        page = self._fetch_page(PAGE_URL)

        # ``with`` guarantees the file is flushed and closed; the original
        # wrote ``fp.close`` without parentheses, which never closed it.
        with open(DUMP_PATH, 'wt', encoding='utf-8') as dump:
            dump.write(page)

        # setPlainText() keeps QTextEdit from interpreting the text as HTML.
        self.txt.setPlainText(self._summarize(page))

    @staticmethod
    def _fetch_page(url):
        """Download *url* and return its body decoded as UTF-8."""
        # Close the HTTP response deterministically instead of leaking it.
        with urllib.request.urlopen(url) as response:
            return response.read().decode('utf-8')

    @staticmethod
    def _summarize(page):
        """Return the title, image sources and links of *page* as text.

        The result has one item per line under the headings for title,
        images and links, and always ends with a newline.
        """
        # A page without a (single-line) <title> yields an empty title line
        # rather than an IndexError.
        title_match = _TITLE_RE.search(page)
        title = title_match.group(1) if title_match else ''

        lines = ['标题:', title, '图片:']
        lines.extend(_IMAGE_RE.findall(page))
        lines.append('链接:')
        lines.extend(_LINK_RE.findall(page))
        return '\n'.join(lines) + '\n'


app = QtGui.QApplication(sys.argv)
mainwindow = MainWindow()
mainwindow.show()
app.exec_()