When I use this code (adapted from Stephen Holiday code – thanks, Stephen for your code!):
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
"""
USSSALoader.py
"""
import os
import re
#import urllib2
from zipfile import ZipFile
import csv
import pickle
def getNameList():
namesDict=extractNamesDict()
maleNames=list()
femaleNames=list()
for name in namesDict:
counts=namesDict[name]
tuple=(name,counts[0],counts[1])
if counts[0]>counts[1]:
maleNames.append(tuple)
elif counts[1]>counts[0]:
femaleNames.append(tuple)
names=(maleNames,femaleNames)
return names
def extractNamesDict():
zf=ZipFile('names.zip', 'r')
filenames=zf.namelist()
names=dict()
genderMap={'M':0,'F':1}
for filename in filenames:
file=zf.open(filename,'r')
rows=csv.reader(file, delimiter=',')
for row in rows:
name=row[0].upper()
# name=row[0].upper().encode('utf-8')
gender=genderMap[row[1]]
count=int(row[2])
if not names.has_key(name):
names[name]=[0,0]
names[name][gender]=names[name][gender]+count
file.close()
print '\tImported %s'%filename
return names
if __name__ == "__main__":
getNameList()
I got this error:
iterator = raw_query.Run(**kwargs)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore.py", line 1622, in Run
itr = Iterator(self.GetBatcher(config=config))
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore.py", line 1601, in GetBatcher
return self.GetQuery().run(_GetConnection(), query_options)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore.py", line 1490, in GetQuery
filter_predicate=self.GetFilterPredicate(),
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore.py", line 1534, in GetFilterPredicate
property_filters.append(datastore_query.make_filter(name, op, values))
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\datastore\datastore_query.py", line 107, in make_filter
properties = datastore_types.ToPropertyPb(name, values)
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore_types.py", line 1745, in ToPropertyPb
pbvalue = pack_prop(name, v, pb.mutable_value())
File "C:\Program Files (x86)\Google\google_appengine\google\appengine\api\datastore_types.py", line 1556, in PackString
pbvalue.set_stringvalue(unicode(value).encode('utf-8'))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe1 in position 1: ordinal not in range(128)
This happens when I have names with non-ASCII caracters (like “Chávez” or “Barañao”). I tried to fix this problem doing this:
for row in rows:
# name=row[0].upper()
name=row[0].upper().encode('utf-8')
gender=genderMap[row[1]]
count=int(row[2])
But, then, I got this other error:
File "C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py", line 17, in getNameList
namesDict=extractNamesDict()
File "C:\Users\CG\Desktop\Google Drive\Sci&Tech\projects\naivebayes\USSSALoader.py", line 43, in extractNamesDict
name=row[0].upper().encode('utf-8')
UnicodeDecodeError: 'ascii' codec can't decode byte 0xed in position 3: ordinal not in range(128)
I also tried this:
def extractNamesDict():
zf=ZipFile('names.zip', 'r', encode='utf-8')
filenames=zf.namelist()
But ZipFile doesn’t have such argument.
So, how to fix that avoiding this UnicodeDecodeError for non-ASCII names?
I’m using this code with GAE.
It looks like your first traceback is AppEngine-related. Are you building a loader that will populate the datastore? If so, seeing the code that comprises the models and does the
put‘ing would be helpful. I will probably be corrected by someone, but in order for that piece to work I believe you actually need todecodeinstead ofencode(i.e. when you read the sheet prior to theput, convert the string to unicode by usingdecode('utf-8')ordecode('latin1'), depending on your situation).As far as your local code, I won’t pretend to know the deep internals of Unicode handling, but I’ve generally used
decode()andencode()to handle these types of situations. I believe the correct encoding to use depends on the underlying text (meaning you’d need to know if it were encodedutf-8orlatin-1, etc.). Here is a quick test with your example:In this case, I needed to use
latin1to decode the encoded string (I was using the terminal), but in your situation usingutf-8may very well work.