This is a crawler that prints all the URLs available on a given link.
#!C:/Python27/python.exe -u
import urllib
import cgi,cgitb
# Have cgitb format uncaught exceptions as a report sent with the response,
# so a failure is visible in the browser instead of only in the server log.
cgitb.enable()
# CGI response header; it has to be written before any other output. The
# trailing "\n\n" (plus print's own newline) ends the header section.
print "Content-Type: text/html\n\n"
def get_page(url):
    """Fetch url and return the response body, or "" if the fetch fails.

    Failures are deliberately swallowed (best effort) so that one bad link
    does not abort a whole crawl.
    """
    # Catch Exception rather than using a bare except, so KeyboardInterrupt
    # and SystemExit can still stop a long-running crawl.
    try:
        response = urllib.urlopen(url)
    except Exception:
        return ""
    try:
        return response.read()
    except Exception:
        return ""
    finally:
        # Always release the connection, even when read() fails.
        response.close()
def get_next_target(page):
    """Find the first '<a href=' link in page.

    Returns (url, end_quote), where url is the text between the first pair
    of double quotes at or after the '<a href=' marker and end_quote is the
    index of the closing quote. Returns (None, 0) when there is no marker,
    or when the marker is not followed by a complete double-quoted value.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        # No opening quote: slicing from -1 would return unrelated text.
        return None, 0
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        # Unterminated value: there is no reliable end position to report.
        return None, 0
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def get_all_links(page):
    """Return the non-empty link targets found in page, in document order.

    Repeatedly asks get_next_target for the next link and continues the
    search from the position it reports.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            # No further link in the remaining text.
            break
        if url:
            # An empty href ("") is skipped, but must not end the scan:
            # later links on the page still have to be collected.
            links.append(url)
        if endpos <= 0:
            # A non-advancing position would loop forever on the same text.
            break
        page = page[endpos:]
    return links
def union(a, b):
    """Extend list a in place with the elements of b it does not contain.

    Elements are appended in the order they appear in b; duplicates within
    b are added only once. Nothing is returned.
    """
    for candidate in b:
        if candidate in a:
            continue
        a.append(candidate)
def add_page_to_index(index, url, content):
    """Register url in index under every whitespace-separated token of content."""
    for token in content.split():
        add_to_index(index, token, url)
def add_to_index(index, keyword, url):
    """Record that keyword occurs at url.

    index maps each keyword to the list of URLs it was seen at. A URL is
    listed at most once per keyword, so a word repeated on one page does
    not add the same URL again.
    """
    urls = index.setdefault(keyword, [])
    if url not in urls:
        urls.append(url)
def lookup(index, keyword):
    """Return the entry stored in index for keyword, or None if it is absent."""
    return index.get(keyword)
def crawl_web(seed, max_pages=None):  # returns index, graph of outlinks
    """Crawl pages reachable from seed and build a word index and link graph.

    seed      -- URL of the first page to fetch.
    max_pages -- optional upper bound on the number of pages fetched. The
                 default, None, keeps crawling until no uncrawled link is
                 left, which on a heavily linked site may never finish in
                 a reasonable time.

    Returns (index, graph): index maps each word to the URLs it was seen
    at, graph maps each crawled URL to the list of links found on it.
    """
    tocrawl = [seed]
    crawled = set()  # set instead of list: constant-time "already seen" test
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        if max_pages is not None and len(crawled) >= max_pages:
            break
        page = tocrawl.pop()
        if page in crawled:
            continue
        content = get_page(page)
        add_page_to_index(index, page, content)
        outlinks = get_all_links(content)
        graph[page] = outlinks
        union(tocrawl, outlinks)
        crawled.add(page)
    return index, graph
index, graph = crawl_web('http://www.bing.com/results.asp?q=fulcrum')
print graph
print """
<html>
<body>
Animesh Pandey
</body>
</html>
"""
print "<br>"
print graph
print "<br>"
print index
print "<br>"
print tocrawl
print "<br>"
print seed
This Python file runs fine in an online interpreter; at least it produces some output.
But when it is run through a browser, it always times out.
I am using Apache 2.2.11 and Python 2.7.3.
What should I try in order to fix this?
This is the correct version of the code.
It prints the URLs found on that static web page, but it is not suitable for sites with many links.