Note: this website only works with Internet Explorer.
I am attempting to scrape a website for a client so that I can automate a task for them. Basically, it will scrape different reports, look for the turnaround times, and email them to the client. My scraping program works fine; the problem I am having is logging into the website with Mechanize, since the login form uses AJAX. I have looked around for a solution but can’t seem to find exactly what I’m looking for.
Below are the HTML form and (from what I can tell) the AJAX code that handles it.
/**
 * Send the login form to the server as an AJAX POST.
 *
 * The serialized contents of Form1 are posted to login.aspx with the
 * isAjax / eventTarget query flags. The reply is handed to
 * TransLoginFinished; failures and exceptions go to reportError and
 * reportException. The request object is kept in the global ajaxRequest.
 */
function TranLogin() {
    var requestOptions = {
        method: 'post',
        postBody: Form.serialize('Form1'),
        onComplete: TransLoginFinished,
        onFailure: reportError,
        onException: reportException
    };

    ajaxRequest = new Ajax.Request(
        'login.aspx?isAjax=true&eventTarget=TranLogin',
        requestOptions
    );
}
/**
 * Completion callback for the AJAX login request.
 *
 * Reads the XML reply (serverResponse.responseXML) and branches on the text
 * of its LoginResult element:
 *   '-1' : alert the FailMsg text and re-enable Form1
 *   '20' : redirect to initpasswd.aspx
 *   '14' : redirect to chgpasswd.aspx?type=chgpwd
 *   '16' : redirect to chgpasswd.aspx?type=pwdexpire
 *   '0'  : optionally offer a password change, then pick an account code
 *          (typed in txtHospCode, or chosen from the HospList reply) and
 *          pass it to TranSelectHosp
 * Any other LoginResult value falls through without action.
 */
function TransLoginFinished(serverResponse) {
    // Bail out if the failure/exception handlers already flagged the request.
    if (requestFailed) return;
    xmlNodes = serverResponse.responseXML;
    usrSite = "8000";
    usrCode = decodeXmlChar(xmlNodes.getElementsByTagName('UserCode')[0].text);
    // '-1': show the server's failure message and let the user try again.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '-1'){
        alert(decodeXmlChar(xmlNodes.getElementsByTagName('FailMsg')[0].text));
        Form.enable('Form1');
        return;
    }
    // '20': send the user to the initial-password page.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '20'){
        window.location.replace('initpasswd.aspx?usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '14': send the user to the change-password page (type=chgpwd).
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '14'){
        window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '16': send the user to the change-password page (type=pwdexpire).
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '16'){
        window.location.replace('chgpasswd.aspx?type=pwdexpire&usersite=' + usrSite + '&usercode=' + usrCode);
        return;
    }
    // '0': credentials accepted; continue with account-code selection.
    if (xmlNodes.getElementsByTagName('LoginResult')[0].text == '0'){
        // When the reply carries an expiry warning, ask whether to change
        // the password now; declining continues the normal login flow.
        if (xmlNodes.getElementsByTagName('PwdExpireWarning')[0].text == 'true'){
            var changePwdNow = window.confirm(decodeXmlChar(xmlNodes.getElementsByTagName('PwdExpireMsg')[0].text));
            if (changePwdNow == true){
                window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
                return;
            }
            // Disabled alternative that used a modal dialog instead of window.confirm:
            // var arg = { promptMsg :decodeXmlChar(xmlNodes.getElementsByTagName('PwdExpireMsg')[0].text),
            //             buttons : [ { value : "Yes", rtnVal : 1 },
            //                         { value : "No", rtnVal : 0 }
            //                       ]
            //           };
            // var rtn = window.showModalDialog('../Modules/ModalMessageBox.aspx',arg, "dialogHeight:140px;dialogWidth:500px; center:1;status:no;");
            // if (rtn && rtn == 1){
            //     window.location.replace('chgpasswd.aspx?type=chgpwd&usersite=' + usrSite + '&usercode=' + usrCode);
            //     return;
            // }
        }
        // No account code typed: take the candidates from the HospList reply,
        // a '|'-separated list whose entries start with "<code>-".
        if (JTrim($('txtHospCode').value) == '') {
            hospList = decodeXmlChar(xmlNodes.getElementsByTagName('HospList')[0].text).split('|');
            // Fewer than two entries: select the first one without asking.
            if (hospList.length < 2) {
                selectedHospCode = hospList[0].split('-')[0];
                TranSelectHosp(selectedHospCode);
                return;
            }
            // Otherwise reveal the chooser and add one clickable link per entry.
            $('divHospList').style.display = 'block';
            for(i=0;i<hospList.length;i++) {
                if (hospList[i] != '') {
                    divHospCode = document.createElement("div");
                    divHospCode.className='divHospCode';
                    $('divHospListBG').appendChild(divHospCode);
                    lnkHospCode = document.createElement("a");
                    // Labels longer than 33 characters are cut to 30 plus '...';
                    // the full entry stays available in the title attribute.
                    if (hospList[i].length <= 33) lnkHospCode.innerText = hospList[i];
                    else lnkHospCode.innerText = hospList[i].substr(0,30) + '...';
                    lnkHospCode.title = hospList[i];
                    lnkHospCode.className = 'lnkHospCode';
                    divHospCode.appendChild(lnkHospCode);
                    lnkHospCode.onmouseover = function(){this.style.color = '#000000';}
                    lnkHospCode.onmouseout = function(){this.style.color = '#6c6c6c';}
                    // Clicking a link selects the code before the first '-' of its label.
                    lnkHospCode.onclick = function(){TranSelectHosp(this.innerText.split('-')[0]);}
                    // Past index 7, fix the container height and make it scroll (set once).
                    if (i > 7 && $('divHospListBG').style.overflow != 'auto') {
                        $('divHospListBG').style.height = '198px';
                        $('divHospListBG').style.overflow = 'auto';
                    }
                }
            }
            return;
        } else {
            // An account code was typed: use it directly.
            TranSelectHosp(JTrim($('txtHospCode').value));
        }
    }
}
<form name="Form1" method="post" action="login.aspx" id="Form1">Input Account Code:
<div class="divRight"> <input name="txtHospCode" type="text" id="txtHospCode" class="inputClass" maxlength="4" /> </div> <div class="divLeft"> <span>Input User Code:</span> </div> <div class="divRight"> <input name="txtUserCode" type="text" id="txtUserCode" class="inputClass" maxlength="6" /> </div> <div class="divLeft"> <span>Input Password:</span></div> <div class="divRight"> <input name="txtPassword" type="password" id="txtPassword" class="inputClass" /> </div> <div class="divLeft"> <span>Login As:</span> </div> <div class="divRight"> <input type="radio" name="rdLoginType" value="D" checked="checked" />Doctor <input type="radio" name="rdLoginType" value="T" />Other </div> <div class="divLeft"> </div> <div class="divRight"> <input class="buttonClass" id="btnOK" type="button" value="Enter" onclick="LoginIn();" /> <input class="buttonClass" id="btnReset" type="button" value="Reset" onclick="ResetInput();" /> </div>
My code so far:
import mechanize
import cookielib
from BeautifulSoup import BeautifulSoup
import html2text
import re

# Instantiate Browser
# Module-level browser object; login_to_website() below drives it.
br = mechanize.Browser()

# Cookie Jar
# Attach a cookie jar so cookies set by the site are kept between requests.
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follows refresh 0 but does not hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

# User-Agent
# Present an Internet Explorer (MSIE 7.0b) User-Agent string to the site.
br.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 6.0)')]
def _parse_login_response(body):
    """Extract the LoginResult and FailMsg texts from the XML login reply.

    Returns a ``(login_result, fail_msg)`` tuple; either item is None when
    the element is absent, and both are None when *body* is not XML.
    """
    from xml.etree import ElementTree

    try:
        root = ElementTree.fromstring(body)
    except ElementTree.ParseError:
        return None, None

    found = {}
    for element in root.iter():
        # Keep only the first occurrence, like the page's `[0]` lookups.
        if element.tag in ('LoginResult', 'FailMsg') and element.tag not in found:
            found[element.tag] = (element.text or '').strip()
    return found.get('LoginResult'), found.get('FailMsg')


def login_to_website(login_url, login_form_name, usr_form_name, pwd_form_name,
                     acct_code_name, usr, pwd, acct_code):
    """Log the user in through the site's AJAX login endpoint.

    The page's own script (``TranLogin``) does not submit the form normally:
    it POSTs the serialized form to
    ``login.aspx?isAjax=true&eventTarget=TranLogin`` and reads the outcome
    from a ``LoginResult`` element in the XML reply.  Mechanize does not run
    JavaScript, so this function fills in the form, points its action at the
    same endpoint, submits it, and inspects ``LoginResult`` itself instead of
    comparing URLs (an AJAX reply never changes the page location).

    Args:
        login_url: URL of the login page.
        login_form_name: ``name`` of the login form.
        usr_form_name: name of the user-code field.
        pwd_form_name: name of the password field.
        acct_code_name: name of the account-code field.
        usr: user code to log in with.
        pwd: password to log in with.
        acct_code: account code to log in with.

    Returns:
        True when the server answers with LoginResult ``'0'``, else False.
    """
    # Open the url of the login page
    br.open(login_url)

    # Select the login form name
    br.select_form(login_form_name)

    # Enter user's credentials into the form
    br.form[acct_code_name] = acct_code
    br.form[usr_form_name] = usr
    br.form[pwd_form_name] = pwd
    br.find_control(name='rdLoginType').value = ['T']

    # Send the POST where the page's JavaScript sends it, not to the bare
    # form action, by adding the AJAX query flags to the action URL.
    separator = '&' if '?' in br.form.action else '?'
    br.form.action = br.form.action + separator + 'isAjax=true&eventTarget=TranLogin'

    # Submit the form
    print("Logging in as: %s" % usr)
    response = br.submit()

    # print current url
    print("We are now at: %s" % br.geturl())

    login_result, fail_msg = _parse_login_response(response.read())

    if login_result == '0':
        # NOTE(review): in a browser the page next calls TranSelectHosp(), which
        # is not visible here; a further request may be needed before the
        # session is usable -- confirm against the site's script.
        print("Successfully logged in")
        return True

    # print error
    print("Login Failed")
    if fail_msg:
        print(fail_msg)
    elif login_result in ('14', '16', '20'):
        # The page script redirects these results to a password page.
        print("Server requests a password change (LoginResult %s)" % login_result)
    return False


login_to_website('https://www.website.com', 'Form1', 'txtUserCode', 'txtPassword','txtHospCode', usr, pwd, acctCode)
Mechanize doesn’t, as far as I know, process JavaScript. So your options are, in rough order of what I’d try: