Code below works fine, beside one thing: Desired output prints few times =)
#! /usr/bin/env python2.7
from PyQt4.QtCore import *
from PyQt4.QtGui import *
from PyQt4.QtWebKit import *
from bs4 import BeautifulSoup as bs
from random import choice
from urllib import urlretrieve
from urllib2 import *
import sys
import os
# Settings for browser
class MyBrowser(QWebPage):
def __init__(self):
QWebPage.__init__(self)
# Specifies whether images are automatically loaded in web pages.
self.settings().setAttribute(QWebSettings.AutoLoadImages, False)
def userAgentForUrl(self, url):
return "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15"
class Name_Creater(QWebView):
def __init__(self):
QWebView.__init__(self)
self.setPage(MyBrowser())
self.loadFinished.connect(self.grab_first_name)
self.frame = self.page().mainFrame()
def grab_first_name(self):
html = unicode(self.frame.toHtml()).encode('utf-8')
soup = bs(html)
for name in soup.findAll('li', text=True):
print name
if __name__ == '__main__':
app = QApplication(sys.argv)
url_first_names = QUrl("http://www.genealogyroadtrip.com/Census/male_names_1.htm")
br = Name_Creater()
br.load(url_first_names)
br.show()
app.exec_()
The problem is that you are connecting to the
loadFinishedsignal of theQWebView, which will be emitted once for each page that is loaded. So if there are multiple frames, there will be multipleloadFinishedsignals emitted.The solution is to connect to the
loadFinishedsignal of the mainFrame: