# -- coding: utf-8 -- # -- coding: iso-8859-9 -- import urllib2 import urllib

Question

0

Asked: May 26, 2026 (2026-05-26T17:41:56+00:00)

# -- coding: utf-8 -- # -- coding: iso-8859-9 -- import urllib2 import urllib

0

# -*- coding: utf-8 -*-
# -*- coding: iso-8859-9 -*-

import urllib2
import urllib
from lxml.html import fromstring
from lxml.html.clean import Cleaner
from formatter import NullFormatter
from urllib2 import URLError
import cookielib
import urllib,time
import urlparse
import datetime
import new
from htmllib import HTMLParser
from lxml.html import fromstring
from lxml.html.clean import Cleaner
import urllib2
import sys,popen2,os
import urlparse

def tagclean(url, Data=None, encoding='utf-8'):
    """Download *url* and return the visible text of its ``<body>``.

    The page is parsed with lxml and passed through an lxml ``Cleaner``
    configured to strip scripts, styles and link elements and to drop the
    heading/div/span/image-map tags listed below, before the remaining
    text content is extracted.

    Args:
        url: Address of the HTML page to fetch.
        Data: Unused; kept so existing callers that pass it keep working.
        encoding: Codec used to encode the extracted text. The previous
            hard-coded ``'ascii'`` with ``'ignore'`` silently deleted every
            non-ASCII character (e.g. Turkish letters). Pass
            ``'iso-8859-9'`` if the consumer expects Latin-5 output.

    Returns:
        The body text as a byte string in *encoding*. Characters that the
        chosen codec cannot represent are replaced with ``?`` instead of
        being dropped without trace.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        IndexError: If the parsed document has no ``/html/body`` element.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the socket even if reading fails part-way through.
        response.close()

    doc = fromstring(html)
    tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'div', 'span',
            'img', 'area', 'map']
    args = {'meta': False, 'safe_attrs_only': False, 'page_structure': False,
            'scripts': True, 'style': True, 'links': True,
            'remove_tags': tags}
    cleaner = Cleaner(**args)

    body = doc.xpath('/html/body')[0]
    text = cleaner.clean_html(body).text_content()
    return text.encode(encoding, 'replace')

I will add this code to the main code:

# Charset assumed when the Content-Type header does not declare a usable one.
DEFAULT_ENCODING = 'utf-8'


def parse_content_type(response):
    """Split a response's Content-Type header into media type and charset.

    Args:
        response: Object whose ``info()`` method returns a mapping of
            response headers (as returned by ``urllib2.urlopen``).

    Returns:
        A ``(ctype, encoding)`` tuple. ``ctype`` is the media type,
        lower-cased and stripped of surrounding whitespace (for example
        ``'text/html'``). ``encoding`` is the value of the ``charset``
        parameter with surrounding whitespace and quotes removed, or
        ``DEFAULT_ENCODING`` when no non-empty ``charset`` is present.

    Raises:
        URLError: If the header is missing or its media type is empty.
    """
    try:
        header = response.info()['Content-Type']
    except KeyError:
        header = None
    # Some header mappings return None for a missing key instead of raising.
    if header is None:
        raise URLError('No Content-Type defined.')

    parts = header.split(';')
    # Media types are case-insensitive, so normalise before callers compare.
    ctype = parts[0].strip().lower()
    if not ctype:
        raise URLError('Could not parse Content-Type: "%s"' % header)

    encoding = DEFAULT_ENCODING
    # Look specifically for "charset=..." so that other parameters
    # (e.g. "boundary=...") are never mistaken for the encoding, and so that
    # extra parameters do not discard a charset that was declared.
    for param in parts[1:]:
        name, sep, value = param.partition('=')
        if sep and name.strip().lower() == 'charset':
            value = value.strip().strip('"\'')
            if value:
                encoding = value
            break

    return ctype, encoding

# This function should return "text/html" as the Content-Type.

# Refuse to continue unless the server declared the document as HTML.
ctype, encoding = parse_content_type(response)

if ctype != 'text/html':
    raise URLError('Wrong Content-Type: "%s"' % ctype)

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-05-26T17:41:57+00:00

Editorial Team

2026-05-26T17:41:57+00:00 · Added an answer on May 26, 2026 at 5:41 pm

good_stuff.encode('ascii', 'ignore') will delete your Turkish characters and anything else that’s not ASCII. Don’t do that. As you seem to be using ISO-8859-9, try using that to encode your Unicode characters.

What operating system are you using?

0

Reply
Share
Share

- Report

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

# -*- coding: utf-8 -*- # -*- coding: iso-8859-9 -*- import urllib2 import urllib

Leave an answerCancel reply

1 Answer

# -- coding: utf-8 -- # -- coding: iso-8859-9 -- import urllib2 import urllib

Leave an answer
Cancel reply