Below is a small snippet of code I have for my twitter crawler mechanism:
from BeautifulSoup import BeautifulSoup
import re
import urllib2
url = 'http://mobile.twitter.com/NYTimesKrugman'
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
print tag.renderContents()
print ('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib2.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'stamp' + str(stamp)
print 'tweets' +str (tweets)
while (stamp is False) and (tweets is True):
b = getnewlink(soup)
print b
red = urllib2.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
The problem is, after my twitter crawler hits about 3 months of tweets, I would like it to stop going to the next page of a user. However, it does not appear to be doing that. It seems to continually going searching for the next page of tweets. I believe this is due to the fact that checkstamp keeps evaluating to False. Does anyone have any suggestions as to how I can modify the code so that the crawler keeps looking for the next page of tweets as long as there are more tweets (verified by are_more_tweets mechanism) and it hasn’t hit 3 months of tweets yet??? Thanks!
EDIT – Please see below:
from BeautifulSoup import BeautifulSoup
import re
import urllib
url = 'http://mobile.twitter.com/cleversallie'
output = open(r'C:\Python28\testrecursion.txt', 'a')
def gettweets(soup):
tags = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
for tag in tags:
a = tag.renderContents()
b = str (a)
print(b)
print('\n\n')
def are_more_tweets(soup):#to check whether there is more than one page on mobile twitter
links = soup.findAll('a', {'href': True}, {id: 'more_link'})
for link in links:
b = link.renderContents()
test_b = str(b)
if test_b.find('more'):
return True
else:
return False
def getnewlink(soup): #to get the link to go to the next page of tweets on twitter
links = soup.findAll('a', {'href': True}, {id : 'more_link'})
for link in links:
b = link.renderContents()
if str(b) == 'more':
c = link['href']
d = 'http://mobile.twitter.com' +c
return d
def checkforstamp(soup): # the parser scans a webpage to check if any of the tweets are older than 3 months
times = soup.findAll('a', {'href': True}, {'class': 'status_link'})
for time in times:
stamp = time.renderContents()
test_stamp = str(stamp)
if not (test_stamp[0]) in '0123456789':
continue
if test_stamp == '3 months ago':
print test_stamp
return True
else:
return False
response = urllib.urlopen(url)
html = response.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
while (not stamp) and (tweets):
b = getnewlink(soup)
print b
red = urllib.urlopen(b)
html = red.read()
soup = BeautifulSoup(html)
gettweets(soup)
stamp = checkforstamp(soup)
tweets = are_more_tweets(soup)
print 'done'
Your
soup.findall()is picking up an image tag in a link that matches your pattern (has anhrefattribute andclassstatus-link).Instead of always
returning on the very first link, try:Which will skip the link if it doesn’t start with a number, so you might actually get to the right link. Keep that
printstatement in there so you can see if you hit some other kind of link that starts with a number that you also need to filter out.Edit: What you were doing was always returning on the very first item in
times. I changed it so it ignored any links that didn’t start with a number.However, this would cause it to return
Noneif it didn’t find any links with a number. This would work fine, except you changedwhile not stamp and tweetstowhile stamp is False and tweets is True. Change it back towhile not stamp and tweetsand it will correctly treatNoneandFalseas the same, and it should work.