# -*- coding: utf-8 -*-
# -*- coding: iso-8859-9 -*-
import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
from urllib2 import URLError
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse
def tagclean(url, Data=None, encoding='ascii', errors='ignore'):
    """Download *url* and return the text content of its cleaned body.

    The page is parsed with lxml, scripts, styles and links are removed by
    ``Cleaner``, the heading/div/span/image tags listed below are dropped
    via ``remove_tags``, and the remaining text is encoded to a byte string.

    Parameters:
        url: address passed to ``urllib2.urlopen``.
        Data: accepted for backward compatibility; it is not used and is
            NOT sent to the server.
        encoding: codec for the returned byte string. The default
            ``'ascii'`` keeps the historical behaviour, which discards
            every non-ASCII character (e.g. Turkish letters); pass
            ``'iso-8859-9'`` or ``'utf-8'`` to keep them.
        errors: error handler given to ``encode`` (default ``'ignore'``).

    Returns:
        The extracted text encoded with *encoding*.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # The response object holds a socket; release it even if read() fails.
        response.close()

    doc = fromstring(html)
    tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'div', 'span',
            'img', 'area', 'map']
    args = {'meta': False, 'safe_attrs_only': False, 'page_structure': False,
            'scripts': True, 'style': True, 'links': True,
            'remove_tags': tags}
    cleaner = Cleaner(**args)

    # NOTE(review): for fragment-like input the parsed tree may not expose
    # /html/body; fall back to the parsed root instead of raising IndexError.
    bodies = doc.xpath('/html/body')
    body = bodies[0] if bodies else doc
    return cleaner.clean_html(body).text_content().encode(encoding, errors)
I will add this code to the following main code:
# Charset assumed when the Content-Type header does not declare one.
DEFAULT_ENCODING = 'utf-8'


def parse_content_type(response):
    """Split the Content-Type header of *response* into type and charset.

    Parameters:
        response: object whose ``info()`` returns the response headers,
            indexable by header name (as urllib2 responses are).

    Returns:
        A ``(ctype, encoding)`` tuple. ``ctype`` is the media type with
        surrounding whitespace removed and lower-cased (e.g.
        ``'text/html'``). ``encoding`` is the value of the ``charset``
        parameter with whitespace and quotes stripped, or
        ``DEFAULT_ENCODING`` when no charset is declared.

    Raises:
        URLError: if the response carries no Content-Type header.
    """
    try:
        header = response.info()['Content-Type']
    except KeyError:
        header = None
    # Some header containers return None instead of raising for a missing key.
    if header is None:
        raise URLError('No Content-Type defined.')

    parts = header.split(';')
    # split() always yields at least one element, so parts[0] is safe.
    ctype = parts[0].strip().lower()

    encoding = DEFAULT_ENCODING
    # Scan every parameter so extra ones (e.g. boundary=...) do not hide
    # the charset, and so an unrelated parameter is never taken as one.
    for param in parts[1:]:
        name, sep, value = param.partition('=')
        if sep and name.strip().lower() == 'charset':
            value = value.strip().strip('"\'')
            if value:
                encoding = value
            break
    return ctype, encoding
# This function should return "text/html" as the Content-Type.
# Reject any response whose media type is not exactly 'text/html'.
ctype, encoding = parse_content_type(response)
if ctype != 'text/html':
    raise URLError('Wrong Content-Type: "%s"' % ctype)
good_stuff.encode('ascii', 'ignore') will delete your Turkish characters and anything else that’s not ASCII. Don’t do that. As you seem to be using ISO-8859-9, try using that to encode your Unicode characters. What operating system are you using?