My code that I will post below gives me this error and I can’t figure out why or how to fix it. If anyone could help I would greatly appreciate it. Thanks!
Traceback (most recent call last):
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
main()
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
for final_url in pool.imap(handle_listing, listings):
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
val = self.waiters.get().wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
return self._exit_event.wait()
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
current.throw(*self._exc)
File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
result = function(*args, **kwargs)
File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
yellow_page = BeautifulSoup(download(yellow_page_url))
File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
BeautifulStoneSoup.__init__(self, *args, **kwargs)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
self._feed(isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
self._detectEncoding(markup, isHTML)
File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer
I don’t know what it wants or what it means…
This is my code:
from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib
def download(url):
print "Downloading:", url
s = urllib2.urlopen(url).read()
if s[:2] == '\x1f\x8b':
ifh = GzipFile(mode='rb', fileobj=StringIO(s))
s = ifh.read()
print "Downloaded: ", url
return s
def replace_chars(text, replacements):
return ''.join(replacements.get(x,x) for x in text)
def handle_listing(listing_url):
listing_document = BeautifulSoup(download(listing_url))
# ignore pages that link to yellowpages
if not listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
listing_title = listing_document.title.text
# define an alphabet
alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
reps = {' ':'-', ',':'', '\'':'', '[':'', ']':'', '-Suite-' + alfa[1-26] : ''}
if TITLE_MATCH.match(listing_title) is not None:
title, = TITLE_MATCH.match(listing_title).groups()
if ADDRESS_MATCH.match(listing_title) is not None:
address, = ADDRESS_MATCH.match(listing_title).groups()
yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
replace_chars(address, reps),
replace_chars(title, reps),
)
yellow_page = BeautifulSoup(download(yellow_page_url))
page_url = yellow_page.find("h3", {"class" : "business-name fn org"})
if page_url:
page_url = page_url.a["href"]
business_name = title[:title.index(",")]
page = BeautifulSoup(download(page_url))
yellow_page_address = page.find("span", {"class" : "street-address"})
if yellow_page_address:
if SequenceMatcher(None, address, yellow_page_address.text).ratio() >= 0.5:
pid, = re.search(r'p(\d{5,20})\.jsp', listing_url).groups(0)
page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})
final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
pid, page_escaped)
return final_url
def main():
pool = eventlet.GreenPool()
listings_document = BeautifulSoup(download(START_URL))
listings = listings_document.findAll("a", href = LOCATION_LISTING)
listings = [listing['href'] for listing in listings]
for final_url in pool.imap(handle_listing, listings):
print final_url
"""
if str(final_url) is not None:
url = str(final_url)
req = urllib2.Request(url)
response = urllib2.urlopen(req)
page = response.read()
time.sleep(2)
"""
for a in range(0,1):
START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'
TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
if __name__ == '__main__':
main()
A very common mistake made by novices using any language that supports exceptions is that they catch exceptions that they do not actually handle. This leads to hard-to-debug errors since it disrupts the normal flow of the program.
Specifically, catching
urllib2.HTTPErrorindownload()is preventing actual problems from being propagated to the rest of the program. Either remove the exception handler altogether, orraiseat the end of the handler to maintain flow.