import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse
def tagclean(url,Data=None):
    """Download *url* and return the text content of its <body> as ASCII.

    The page is parsed with lxml, passed through an lxml ``Cleaner`` and
    reduced to its text content.

    Parameters:
        url:  address handed to ``urllib2.urlopen``.
        Data: currently unused; kept so existing callers that pass it
              keep working.

    Returns:
        A byte string holding the cleaned body text. Characters that cannot
        be encoded as ASCII are dropped (``encode('ascii', 'ignore')``).

    Raises:
        IndexError: if the parsed document has no ``/html/body`` element.
        Whatever ``urllib2.urlopen`` raises when the download fails.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails half-way.
        response.close()

    doc = fromstring(html)

    # Tags listed here are handed to Cleaner(remove_tags=...); per the lxml
    # Cleaner documentation the tag itself is removed while its content is
    # merged into the parent.
    tags = ['h1','h2','h3','h4','h5','h6', 'div', 'span', 'img', 'area', 'map']
    # Cleaner switches (see lxml docs): scripts/style/links are stripped,
    # while meta tags, page structure and all attributes are left alone.
    args = {'meta':False, 'safe_attrs_only':False, 'page_structure':False,
            'scripts':True, 'style':True, 'links':True, 'remove_tags':tags}
    cleaner = Cleaner(**args)

    # Only the first <body> element is cleaned; [0] fails on a body-less page.
    body = doc.xpath('/html/body')[0]
    cleaned = cleaner.clean_html(body)
    return cleaned.text_content().encode('ascii', 'ignore')
def writetofile(text,filename):
    """Append ``str(text)`` to the file named *filename*.

    The file is opened in append mode, so it is created when missing and
    earlier content is preserved.

    Parameters:
        text:     value to write; it is converted with ``str()`` first.
        filename: path of the file to append to.

    Returns:
        None.
    """
    # The context manager closes the handle even if str(text) or write()
    # raises, which the explicit close() call did not guarantee.
    with open(filename, "a") as writefile:
        writefile.write(str(text))
if __name__=="__main__":
    # Script entry point: ask for a URL, extract the page's body text with
    # tagclean() and append it to a .txt file named after the URL.
    url = raw_input("Enter url:")
    # Drop the scheme (http://, https://, ...) and turn dots into underscores
    # to build the file name. Splitting on "://" instead of the literal
    # "http://" avoids an IndexError for https URLs.
    # NOTE(review): a URL with a path still contains "/" here, which open()
    # treats as a directory separator -- confirm whether that is intended.
    spliturl = url.split("://", 1)[-1].replace(".","_")
    text = str(tagclean(url))
    # The original passed the undefined name `text` here (the result had been
    # stored in `metin`), which raised NameError before anything was written.
    writetofile(text,spliturl+".txt")
And below is my URL scanner code:
def scanurl(url):
    """Download *url* and return the absolute targets of its anchors.

    The page is fetched with ``urllib.urlretrieve`` into a local file, fed
    through ``htmllib.HTMLParser`` and every entry of the parser's
    ``anchorlist`` is resolved against *url* with ``urlparse.urljoin``.
    Progress lines with the current time (HH:MM:SS) are printed before and
    after the download.

    Parameters:
        url: address of the page to scan.

    Returns:
        A list of URLs, one per anchor found, in document order.
    """
    # A single-argument print(...) emits exactly the text the original print
    # statements did (items separated by one space) and is also valid
    # Python 3 syntax.
    print("%s %s %s" % ("saving: ", url,
                        datetime.datetime.now().strftime("%H:%M:%S")))
    tmp = urllib.urlretrieve(url)
    print("%s %s" % ("saving finished",
                     datetime.datetime.now().strftime("%H:%M:%S")))

    parser = HTMLParser(NullFormatter())
    # tmp[0] is the local file name returned by urlretrieve; the context
    # manager makes sure the handle is closed after reading.
    with open(tmp[0]) as page:
        parser.feed(page.read())
    # Flush any data the parser is still buffering so that anchors at the
    # very end of the page are not lost.
    parser.close()

    return [urlparse.urljoin(url, a) for a in parser.anchorlist]
I want to combine the tag cleaner above with this URL scanner…
I’m taking a wild guess here, since I don’t really know what you are trying to achieve, but don’t you mean to replace
with
If not, you really need to elaborate your question.