I have been using the following script to create sitemaps for my clients websites. The issue is it does not work for every site. I have found that many if not all the sites hosted on godaddy do not spider. If anyone can see an error in my script or know what is causing the fault I would greatly appreciate the help.
Thanks in advance
set_time_limit(0);
class spider_man
{
var $url;
var $limit;
var $cache;
var $crawled;
var $banned_ext;
var $domain;
function spider_man( $url, $banned_ext, $limit ){
$this->domain = $url;
$this->url = 'http://'.$url ;
$this->banned_ext = $banned_ext ;
$this->limit = $limit ;
if( !fopen( $this->url, "r") ) return false;
else $this->_spider($this->url);
}
function _spider( $url ){
$this->cache = @file_get_contents( urldecode( $url ) );
if( !$this->cache ) return false;
$this->crawled[] = urldecode( $url ) ;
preg_match_all( "#href=\"(https?://[&=a-zA-Z0-9-_./]+)\"#si", $this->cache, $links );
if ( $links ) :
foreach ( $links[1] as $hyperlink ){
if(strpos($hyperlink,$this->domain)===false){ break; }
else{
$this->limit--;
if( ! $this->limit ) return;
if( $this->is_valid_ext( trim( $hyperlink ) ) and !$this->is_crawled( $hyperlink ) ) :
$this->crawled[] = $hyperlink;
echo "Crawling $hyperlink<br />\n";
unset( $this->cache );
$this->_spider( $hyperlink );
endif;
}
}
endif;
}
function is_valid_ext( $url ){
foreach( $this->banned_ext as $ext ){
if( $ext == substr( $url, strlen($url) - strlen( $ext ) ) ) return false;
}
return true;
}
function is_crawled( $url ){
return in_array( $url, $this->crawled );
}
}
$banned_ext = array(".dtd",".css",".xml",".js",".gif",".jpg",".jpeg",".bmp",".ico",".rss",".pdf",".png",".psd",".aspx",".jsp",".srf",".cgi",".exe",".cfm");
$spider = new spider_man( 'domain.com', $banned_ext, 100 );
print_r( $spider->crawled );
When you access a site using fopen() of file_get_contents() you don’t send AGENT or REFERRER or other header information. It’s blatently obvious that this is an automated script.
You need to look at sending context with your fopen (check the docs and read the context section) or, better still, using CURL. This allows you to set the agent and referrer headers to simulate a browser.