My code that I will post below gives me this error and I can’t

Question

0

Asked: May 27, 2026 (2026-05-27T20:35:38+00:00)

My code that I will post below gives me this error and I can’t

0

My code that I will post below gives me this error and I can’t figure out why or how to fix it. If anyone could help I would greatly appreciate it. Thanks!

Traceback (most recent call last):
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 99, in <module>
    main()
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 76, in main
    for final_url in pool.imap(handle_listing, listings):
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenpool.py", line 232, in next
    val = self.waiters.get().wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 166, in wait
    return self._exit_event.wait()
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\event.py", line 120, in wait
    current.throw(*self._exc)
  File "C:\Python27\lib\site-packages\eventlet-0.9.16-py2.7.egg\eventlet\greenthread.py", line 192, in main
    result = function(*args, **kwargs)
  File "C:\Users\Robert\Documents\j-a-c-o-b\newlc.py", line 48, in handle_listing
    yellow_page = BeautifulSoup(download(yellow_page_url))
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1519, in __init__
    BeautifulStoneSoup.__init__(self, *args, **kwargs)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1144, in __init__
    self._feed(isHTML=isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1168, in _feed
    smartQuotesTo=self.smartQuotesTo, isHTML=isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1770, in __init__
    self._detectEncoding(markup, isHTML)
  File "build\bdist.win32\egg\BeautifulSoup.py", line 1915, in _detectEncoding
    '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
TypeError: expected string or buffer

I don’t know what it wants or what it means…

This is my code:

from gzip import GzipFile
from cStringIO import StringIO
import re
import webbrowser
import time
from difflib import SequenceMatcher
import os
import sys
from BeautifulSoup import BeautifulSoup
import eventlet
from eventlet.green import urllib2
import urllib2
import urllib

def download(url):
    print "Downloading:", url
    s = urllib2.urlopen(url).read()
    if s[:2] == '\x1f\x8b':
        ifh = GzipFile(mode='rb', fileobj=StringIO(s))
        s = ifh.read()
    print "Downloaded: ", url
    return s


def replace_chars(text, replacements):
    """Return *text* with each character substituted via *replacements*.

    Every character of *text* is looked up individually in the mapping;
    characters without an entry are kept unchanged. Because the lookup is
    per character, keys longer than one character never match.
    """
    pieces = []
    for char in text:
        pieces.append(replacements.get(char, char))
    return ''.join(pieces)

def handle_listing(listing_url):
    """Resolve a listing page to a locationary proxy "save" URL.

    Downloads the listing page and, unless it already links to
    yellowpages.com, extracts the business title and street address from the
    page title, searches yellowpages.com for them, and opens the first
    result. When that result's street address is at least 50% similar to the
    listing's address, the proxy URL carrying the listing id and the escaped
    yellowpages URL is built.

    Args:
        listing_url: URL of the listing page to process.

    Returns:
        The proxy URL string, or None when any step finds no usable match.
    """
    listing_document = BeautifulSoup(download(listing_url))

    # ignore pages that link to yellowpages
    if listing_document.find("a", href=re.compile(re.escape("http://www.yellowpages.com/") + ".*")):
        return None

    listing_title = listing_document.title.text

    # Match each pattern once and bail out early instead of nesting.
    title_match = TITLE_MATCH.match(listing_title)
    if title_match is None:
        return None
    address_match = ADDRESS_MATCH.match(listing_title)
    if address_match is None:
        return None
    title, = title_match.groups()
    address, = address_match.groups()

    # define an alphabet
    alfa = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # NOTE(review): alfa[1-26] is alfa[-25], i.e. 'B', so the last key is
    # '-Suite-B'. replace_chars looks up one character at a time, so this
    # multi-character key never matches; kept as-is pending a decision on
    # whether suite suffixes should really be stripped.
    reps = {' ':'-', ',':'', '\'':'', '[':'', ']':'', '-Suite-' + alfa[1-26] : ''}

    yellow_page_url = "http://www.yellowpages.com/%s/%s?order=distance" % (
        replace_chars(address, reps),
        replace_chars(title, reps),
    )
    yellow_page = BeautifulSoup(download(yellow_page_url))

    result_heading = yellow_page.find("h3", {"class" : "business-name fn org"})
    if not result_heading:
        return None
    page_url = result_heading.a["href"]

    page = BeautifulSoup(download(page_url))
    yellow_page_address = page.find("span", {"class" : "street-address"})
    if not yellow_page_address:
        return None

    # Require the two street addresses to be at least half similar.
    if SequenceMatcher(None, address, yellow_page_address.text).ratio() < 0.5:
        return None

    # A listing URL without a "p<digits>.jsp" id cannot be saved; treat it
    # like any other non-match rather than raising.
    pid_match = re.search(r'p(\d{5,20})\.jsp', listing_url)
    if pid_match is None:
        return None
    pid = pid_match.group(1)

    page_escaped = replace_chars(page_url, {':':'%3A', '/':'%2F', '?':'%3F', '=':'%3D'})

    final_url = "http://www.locationary.com/access/proxy.jsp?ACTION_TOKEN=proxy_jsp$JspView$SaveAction&inPlaceID=%s&xxx_c_1_f_987=%s" % (
            pid, page_escaped)
    return final_url


def main():

    pool = eventlet.GreenPool()
    listings_document = BeautifulSoup(download(START_URL))
    listings = listings_document.findAll("a", href = LOCATION_LISTING)
    listings = [listing['href'] for listing in listings]

    for final_url in pool.imap(handle_listing, listings):
        print final_url

        """
        if str(final_url) is not None:

            url = str(final_url)

            req = urllib2.Request(url)
            response = urllib2.urlopen(req)
            page = response.read()
            time.sleep(2)

        """

# Single-pass loop: the body runs exactly once with a == 0. The names assigned
# inside are module-level globals read by main() and handle_listing().
for a in range(1):

    # Anchors on the start page whose href matches this are treated as listings.
    LOCATION_LISTING = re.compile(r'http://www\.locationary\.com/place/en/US/.{1,50}/.{1,50}/.{1,100}\.jsp')
    # Captures the street address from a listing page title.
    ADDRESS_MATCH = re.compile(r'.{1,100}\((.*), .{4,14}, United States\)$')
    # Captures the business title from a listing page title.
    TITLE_MATCH = re.compile(r'(.*) \(\d{1,10}.{1,100}\)$')
    # Page whose listing links are processed.
    START_URL = 'http://www.locationary.com/place/en/US/Arkansas/Fayetteville-page2/?ACTION_TOKEN=NumericAction'

    # Only run the scraper when executed as a script, not on import.
    if __name__ == '__main__':
        main()

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-27T20:35:38+00:00

A very common mistake made by novices using any language that supports exceptions is that they catch exceptions that they do not actually handle. This leads to hard-to-debug errors since it disrupts the normal flow of the program.

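Specifically, catching urllib2.HTTPError in download() prevents the real problem from propagating to the rest of the program. (The handler does not appear in the code as posted above, so it was presumably removed before posting.) When the handler swallows the error, download() falls through and returns None, and BeautifulSoup(None) then fails with "TypeError: expected string or buffer", which is the error in the traceback. Either remove the exception handler altogether, or re-raise at the end of the handler so the original error surfaces.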

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

My code that I will post below gives me this error and I can’t

Leave an answer
Cancel reply

1 Answer

Leave an answer
Cancel reply