public partial class Form1 : Form
{
int y = 0;
string url = @"http://www.google.co.il";
string urls = @"http://www.bing.com/images/search?q=cat&go=&form=QB&qs=n";
public Form1()
{
InitializeComponent();
//webCrawler(urls, 3);
List<string> a = webCrawler(urls, 1);
//GetAllImages();
}
private int factorial(int n)
{
if (n == 0) return 1;
else y = n * factorial(n - 1);
listBox1.Items.Add(y);
return y;
}
private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
{
List<string> mainLinks = new List<string>();
if (document.DocumentNode.SelectNodes("//a[@href]") == null)
{ }
foreach (HtmlNode link in document.DocumentNode.SelectNodes("//a[@href]"))
{
var href = link.Attributes["href"].Value;
mainLinks.Add(href);
}
return mainLinks;
}
private List<string> webCrawler(string url, int levels)
{
HtmlAgilityPack.HtmlDocument doc;
HtmlWeb hw = new HtmlWeb();
List<string> webSites;// = new List<string>();
List<string> csFiles = new List<string>();
csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
csFiles.Add("current site name in this level is : "+url);
/* later should be replaced with real cs files .. cs files links..*/
doc = hw.Load(url);
webSites = getLinks(doc);
if (levels == 0)
{
return csFiles;
}
else
{
int actual_sites = 0;
for (int i = 0; i < webSites.Count() && i< 100000; i++) // limiting ourseleves for 20 sites for each level for now..
//or it will take forever.
{
string t = webSites[i];
/*
if (!webSites.Contains(t))
{
webCrawler(t, levels - 1);
}
*/
if ( (t.StartsWith("http://")==true) || (t.StartsWith("https://")==true) ) // replace this with future FilterJunkLinks function
{
actual_sites++;
csFiles.AddRange(webCrawler(t, levels - 1));
richTextBox1.Text += t + Environment.NewLine;
}
}
// report to a message box only at high levels..
if (levels==1)
MessageBox.Show(actual_sites.ToString());
return csFiles;
}
}
The exception is thrown after a few sites have been sent to the getLinks function.
The exception is in the getLinks function on the line:
foreach (HtmlNode link in document.DocumentNode.SelectNodes("//a[@href]"))
Object reference not set to an instance of an object
I tried to use there IF to check if its null then I did return mainLinks; which is a list.
But if I’m doing it I’m not getting all the links from the website.
Now I’m using urls in the constructor if I’m using url (www.google.co.il) I’m getting the same exception after few seconds.
I can’t figure out why this exception is throw up. Is there any reason for this exception ?
System.NullReferenceException was unhandled
Message=Object reference not set to an instance of an object.
Source=GatherLinks
StackTrace:
at GatherLinks.Form1.getLinks(HtmlDocument document) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 55
at GatherLinks.Form1.webCrawler(String url, Int32 levels) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 76
at GatherLinks.Form1.webCrawler(String url, Int32 levels) in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 104
at GatherLinks.Form1..ctor() in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Form1.cs:line 29
at GatherLinks.Program.Main() in D:\C-Sharp\GatherLinks\GatherLinks\GatherLinks\Program.cs:line 18
at System.AppDomain._nExecuteAssembly(Assembly assembly, String[] args)
at System.AppDomain.ExecuteAssembly(String assemblyFile, Evidence assemblySecurity, String[] args)
at Microsoft.VisualStudio.HostingProcess.HostProc.RunUsersAssembly()
at System.Threading.ThreadHelper.ThreadStart_Context(Object state)
at System.Threading.ExecutionContext.Run(ExecutionContext executionContext, ContextCallback callback, Object state)
at System.Threading.ThreadHelper.ThreadStart()
The problem appears to be that you’re testing for null but then doing nothing about it – here
I suspect you want to handle the null case but haven’t written the code to do it. You probably want something like:
you’d probably want to tidy up to something more like: