I have a UTF-8 xml file littered with codes like ÃÂ&xA7; for a cedilla etc
I have written the snippet below to remove or replace with acceptable values
1. Is there a better way to do this?
2. When I run this on some large XML files (>50MB) I may get Out of memory errors. If there is no better way how can I optimize it avoid OOM errors?
<cffile
action="read"
file="#ExpandPath('./xs.xml')#"
variable="myfile"/>
<cfset myfile =ReReplace(myfile,'&##xC2;&##x2013;','.','all')/>
<cfset myfile =ReReplace(myfile,'&##xC2;&##x2019;','''','all')/>
<cfset myfile =ReReplace(myfile,'&##xC2;&##x201D;','"','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x192;&##xC2;&##xA7;','c','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##xA7;','c','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##xA9;','e','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##x2022;','(*)','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x192;&##xC2;&##x201A;\?','(*)','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##xB7;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xC3;&##x201A;&##xC2;&##x2018;','''','all')/>
<cfset myfile =ReReplace(myfile,' &##xC3;&##x201A;&##xC2;&##x201C;',' "','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x201C;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x2122;','''','all')/>
<cfset myfile =ReReplace(myfile,' &##xE2;&##x20AC;&##x153;',' "','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##x153;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xFFFD; ','" ','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xFFFD;','-','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x201E;&##xA2;','(TM)','all')/>
<cfset myfile =ReReplace(myfile,'&##xE2;&##x20AC;&##xA2;','(*)','all')/>
<cfset myfile =ReReplace(myfile,'&##xEF;&##x201A;&##xA7;','(*)','all')/>
<cfset myfile =ReReplace(myfile,'(&##[^;]*;)','','all')/>
<cffile action="write"
file="#ExpandPath('./xs_new.xml')#"
output="#myfile#"/>
thanks
Use ColdFusion’s file functions to work on one line at a time, instead of reading the entire thing into memory: