import urllib2 import urllib from lxml.html import fromstring from lxml.html.clean import Cleaner from formatter

Question

0

Asked: May 26, 2026 (2026-05-26T13:21:23+00:00)

import urllib2 import urllib from lxml.html import fromstring from lxml.html.clean import Cleaner from formatter

0

import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse

def tagclean(url,Data=None):
    """Download ``url`` and return the text content of its ``<body>``.

    The page is parsed with lxml and passed through a ``Cleaner`` that is
    configured with ``scripts``, ``style`` and ``links`` enabled and with a
    list of tags (headings, ``div``, ``span``, ``img``, ``area``, ``map``)
    handed to ``remove_tags``.  The cleaned body is then flattened to text.

    Parameters:
        url:  address of the page to fetch.
        Data: accepted for interface compatibility; it is not used.

    Returns:
        The body text encoded as ASCII; characters that cannot be
        represented in ASCII are dropped (``'ignore'``).

    Raises:
        IndexError: if the parsed document has no ``/html/body`` element.
        Any error raised by ``urllib2.urlopen`` or by reading the response
        propagates unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Always release the connection, even if reading fails part-way.
        response.close()

    doc = fromstring(html)

    tags = ['h1','h2','h3','h4','h5','h6', 'div', 'span', 'img', 'area', 'map']
    args = {'meta':False, 'safe_attrs_only':False, 'page_structure':False,
            'scripts':True, 'style':True, 'links':True, 'remove_tags':tags}
    cleaner = Cleaner(**args)

    # Only the document body is of interest; the head is discarded.
    body = doc.xpath('/html/body')[0]
    text = cleaner.clean_html(body).text_content()
    return text.encode('ascii', 'ignore')

def writetofile(text,filename):
    """Append ``text`` to the file at ``filename``.

    Parameters:
        text:     value to write; it is converted with ``str()`` first.
        filename: path of the file, opened in append mode (created if it
                  does not exist).

    Raises:
        IOError/OSError: if the file cannot be opened or written.
    """
    # The context manager closes the handle even if the write raises.
    with open(filename, "a") as writefile:
        writefile.write(str(text))

if __name__=="__main__":
    # Prompt for a page, extract its body text and append it to a text file
    # whose name is derived from the URL.
    url = raw_input("Enter url:")
    # Strip whatever scheme prefix is present (http://, https://, or none)
    # instead of assuming "http://", which raised IndexError otherwise.
    # Dots and slashes are replaced so the result is a single file name.
    spliturl = url.split("://", 1)[-1].strip("/").replace(".","_").replace("/","_")
    metin=str(tagclean(url))
    # Write the extracted text; the original passed the undefined name
    # ``text`` here, which raised NameError.
    writetofile(metin,spliturl+".txt")

And below is my URL scanner code:

def scanurl(url):   
    print "saving: ",url,datetime.datetime.now().strftime("%H:%M:%S")
    tmp=urllib.urlretrieve(url)
    print "saving finished",datetime.datetime.now().strftime("%H:%M:%S")
    parser= HTMLParser(NullFormatter( ))
    parser.feed( open(tmp[0]).read( ) )
    urls=[]
    for a in parser.anchorlist:
        urls.append(urlparse.urljoin( url, a ))
return urls

I want to combine tagclean with this URL scanner…

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-26T13:21:23+00:00

Editorial Team

2026-05-26T13:21:23+00:00 · Added an answer on May 26, 2026 at 1:21 pm

I’m going on a wild guess here since I don’t really know what you are trying to achieve, but don’t you mean to replace

parser.feed( open(tmp[0]).read( ) )

with

parser.feed( tagclean(url) )

If not, you really need to elaborate on your question.

0

Reply
Share
Share

- Report

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

import urllib2 import urllib from lxml.html import fromstring from lxml.html.clean import Cleaner from formatter

Leave an answer · Cancel reply

1 Answer

Leave an answer
Cancel reply