I am attempting to write a spider using CasperJS used to parse pages and extract relevant links. The site in question features a hierarchical view of files and folders. I have written a recursive method to perform the navigation through the file structure, but after the first iteration of the method my array is returning null, even though function is running correctly. Any help identifying the problem would be really appreciated.
var processPage = function() {
//Gather links
var links = this.evaluate(function() { //links is the array being set to null
var elements = document.querySelectorAll("a");
return Array.prototype.map.call(elements, function(e) {
//check link matches our white list
var matchesWhitelist = false;
var fileDescription = e.querySelector("span").innerHTML;
console.log("span text:" + fileDescription);
//begin checking
if (fileDescription.indexOf('.mp3') != -1) matchesWhitelist = true;
//if (fileDescription.indexOf('.wmv') != -1) .... etc
//failing that is the link for a folder rather than a file
var hrefLink = e.getAttribute("href");
if (hrefLink.indexOf('folder-files') != -1) matchesWhitelist = true;
if (matchesWhitelist) {
console.log('match');
console.log('Adding link: ' + hrefLink)
return hrefLink;
}
else {
console.log('no match');
}
});
});
console.log("linkslength: " + links.length); // links will be null upon recursion
for (var i = 0; i < links.length; i++) {
//check link matches our 'whitelist'
this.thenOpen("https://TLD" + links[i]).then(function() {
this.echo("New URL: " + this.getCurrentUrl());
//check for files
if (this.exists(".fileDownload")) {
//extract link
} else {
//assume that this is a 'folder' link and send to be processed for more links
casper.then(processPage); //continue recursion
};
});
}
Thanks
You seem to want a reduced set.
This means you should use
.filterinstead of.map, and return a truthy value if you want to keep the link, and a falsey value if not.And if you don’t need the logging, you would just return
matchesWhitelist.So get rid of the logging, and you can reduce your filter to this.