python - 使用PyQt/QtWebkit进行Web爬取多个链接

我正试图爬取一个大型的政府记录网站，这需要一种“滚雪球”的方法，即从主搜索页面开始，然后沿着爬虫找到的每个链接进入下一个页面。
我已经能够按照这个 SiteScraper 教程，使用 PyQt 加载主页了。

import sys
from PySide.QtGui import *
from PySide.QtCore import *
from PySide.QtWebKit import *
from BeautifulSoup import BeautifulSoup

class Render(QWebPage):
    """Load a single URL in a QWebPage and block until loading finishes.

    Each instance constructs its own QApplication and runs its event loop
    inside the constructor. After construction, ``self.frame`` holds the
    page's main frame.

    NOTE: because a new QApplication is created per instance, instantiating
    this class a second time in the same process raises
    ``RuntimeError: A QApplication instance already exists.``
    """

    def __init__(self, url):
        # The application object is created before the page is initialised.
        self.app = QApplication(sys.argv)
        super(Render, self).__init__()
        self.loadFinished.connect(self._loadFinished)
        target = QUrl(url)
        self.mainFrame().load(target)
        # Run the event loop here; _loadFinished() is what stops it.
        self.app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()

def main():
    """Render the search page, then render every regulation page it links to.

    The search page is loaded through ``Render``; its HTML is parsed with
    BeautifulSoup to collect the anchors inside the ``x-grid3-body`` div,
    and each linked page is then loaded through ``Render`` in turn.
    """
    baseUrl = 'http://www.thesite.gov'
    url = 'http://www.thesite.gov/search'
    r = Render(url)
    html = r.frame.toHtml()
    # use BeautifulSoup to cycle through each regulation
    soup = BeautifulSoup(html)

    # These statements use main()'s locals (soup, baseUrl), so they must
    # live inside the function body rather than at module level.
    regs = soup.find('div', {'class': 'x-grid3-body'}).findAll('a')

    # cycle through list and call up each page separately
    for reg in regs:
        link = baseUrl + reg['href']
        link = str(link)
        # use Qt to load each regulation page
        r = Render(link)

        html = r.frame.toHtml()  # get actual rendered web page

问题是，当我尝试呈现新网页时，出现此错误：

RuntimeError: A QApplication instance already exists.

我知道函数正试图调用另一个QApplication实例。但是，如何导航到具有相同实例的新页面？

class Render(QWebPage):
    """Load a single URL in a QWebPage using a caller-supplied QApplication.

    The constructor starts the load and runs ``app``'s event loop until the
    page reports that loading has finished. After construction,
    ``self.frame`` holds the page's main frame.

    Parameters:
        app: the QApplication whose event loop drives the page load. The
            caller creates it once and passes the same object to every
            Render.
        url: address of the page to load.
    """

    def __init__(self, app, url):
        QWebPage.__init__(self)
        # Keep a reference so the load-finished slot can stop the loop.
        self.app = app
        self.loadFinished.connect(self._loadFinished)
        self.mainFrame().load(QUrl(url))
        # Blocks until _loadFinished() calls quit(); without that call the
        # event loop would never exit and the constructor would never return.
        app.exec_()

    def _loadFinished(self, result):
        """Slot for ``loadFinished``: keep the main frame and stop the loop."""
        self.frame = self.mainFrame()
        self.app.quit()

def main():
    """Create one QApplication and render the search page with it."""
    baseUrl = 'http://www.thesite.gov'
    url = 'http://www.thesite.gov/search'

    # A single application object for the whole run; it is handed to Render.
    app = QApplication(sys.argv)

    page = Render(app, url)
    html = page.frame.toHtml()

最佳答案

那好吧。（使用simplejson或json可能会更容易些。）答案是不要创建多个QApplication，这是不允许的。让main创建一个QApplication，然后直接使用QWebPage，而不必费心调用QApplication.exec_()。如果这样不起作用，请把它放到另一个QThread中运行。