I’m writing a python program to crawl twitter using a combination of urllib2, the python twitter wrapper for the api, and BeautifulSoup. However, when I run my program, I get an error of the following type:
ray_krueger
RafaelNadal
Traceback (most recent call last):
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 78, in <module>
crawl(start_follower, output, depth)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 74, in crawl
crawl(y, output, in_depth - 1)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 74, in crawl
crawl(y, output, in_depth - 1)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 64, in crawl
request = urllib2.Request(new_url)
File "C:\Python28\lib\urllib2.py", line 192, in __init__
self.__original = unwrap(url)
File "C:\Python28\lib\urllib.py", line 1038, in unwrap
url = url.strip()
AttributeError: 'NoneType' object has no attribute 'strip'
I’m completely unfamiliar with this type of error (new to python) and searching for it online has yielded very little information. I’ve attached my code as well, but do you have any suggestions?
Thanx
Snehizzy
import twitter
import urllib
import urllib2
import htmllib
from BeautifulSoup import BeautifulSoup
import re
start_follower = "NYTimeskrugman"
depth = 3
output = open(r'C:\Python27\outputtest.txt', 'a') #better to use SQL database thanthis
api = twitter.Api()
#want to also begin entire crawl with some sort of authentication service
def site(follower):
followersite = "http://mobile.twitter.com/" + follower
return followersite
def getPage(follower):
thisfollowersite = site(follower)
request = urllib2.Request(thisfollowersite)
response = urllib2.urlopen(request)
return response
def getSoup(response):
html = response.read()
soup = BeautifulSoup(html)
return soup
def get_more_tweets(soup):
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def recordlinks(soup,output):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
output.write(b)
output.write('\n\n')
def checkforstamp(soup):
times = nsoup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
if str(stamp) == '3 months ago':
return True
def crawl(follower, output, in_depth):
if in_depth > 0:
output.write(follower)
a = getPage(follower)
new_soup = getSoup(a)
recordlinks(new_soup, output)
currenttime = False
while currenttime == False:
new_url = get_more_tweets(new_soup)
request = urllib2.Request(new_url)
response = urllib2.urlopen(request)
new_soup = getSoup(response)
recordlinks(new_soup, output)
currenttime = checkforstamp(new_soup)
users = api.GetFriends(follower)
for u in users[0:5]:
x = u.screen_name
y = str(x)
print y
crawl(y, output, in_depth - 1)
output.write('\n\n')
output.write('\n\n\n')
crawl(start_follower, output, depth)
print("Program done. Look at output file.")
When you do
in
crawl(),new_urlisNone. As you’re gettingnew_urlfromget_more_tweets(new_soup), that meansget_more_tweets()is returningNone.That means
return dis never being reached, which means eitherstr(b) == 'more'was never true, orsoup.findAll()didn’t return any links sofor link in linksdoes nothing.