I’m having issues using StreamWriter to code a scraper for a current project i’ve got. The loop i’ve coded is below
I’ve debugged all the variables coming into the loop and everything is set as it should be. When i pass in a url and the range to search through based on an ID GET variable in the url it fails to write the second sourceCode string
Could someone be kind enough to tell me if i’m not flushing something or is there something else at work here??
I’ve wrecked my head trying to find the root cause but its proving very stubborn
using System;
using System.IO;
using System.Windows.Forms;
namespace Scraper
{
public partial class Form1 : Form
{
Scraper scraper = new Scraper();
private StreamWriter sw;
public Form1()
{
InitializeComponent();
}
private void button1_Click(object sender, EventArgs e)
{
string url = textBox1.Text;
string[] urlBits = url.Split('.');
string[] domain = urlBits[2].Split('/');
string filepath = @"C:\Users\Herbaldinho\Desktop\"+urlBits[1]+"-"+domain[0];
string parentPath = @"C:\Users\Herbaldinho\Desktop\";
string newPath = Path.Combine(parentPath, filepath);
if (File.Exists(filepath))
{}
else
{
Directory.CreateDirectory(newPath);
}
DateTime today = DateTime.Today;
string curDate = String.Format("{0:ddd-MMM-dd-yyyy}", today);
string subPath = newPath + "\\" + curDate;
string newSubPath = Path.Combine(newPath, subPath);
if (File.Exists(subPath))
{ }
else
{
Directory.CreateDirectory(newSubPath);
}
string lower = textBox2.Text;
int lowerValue;
int.TryParse(lower, out lowerValue);
string upper = textBox3.Text;
int upperValue;
int.TryParse(upper, out upperValue);
int i;
for (i = lowerValue; i < upperValue; i++)
{
string filename = newSubPath+"\\Advert-"+i+".html";
string adPage = url + i;
bool write = scraper.UrlExists(adPage);
if (write)
{
string sourceCode = scraper.getSourceCode(adPage);
using (sw = new StreamWriter(filename))
{
sw.Write(sourceCode);
}
}
}
MessageBox.Show("Scrape Complete");
}
}
}
####This is the Scraper Object
using System.Net;
namespace Scraper
{
class Scraper
{
WebClient w = new WebClient();
public bool UrlExists(string url)
{
try
{
HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
request.Method = "HEAD";
HttpWebResponse response = request.GetResponse() as HttpWebResponse;
return (response.StatusCode == HttpStatusCode.OK);
}
catch
{
return false;
}
}
public string getSourceCode(string url)
{
string s = w.DownloadString(url);
return s;
}
}
}
Found the answer to the problem this morning
For anyone else having a similar problem, the try catch logic in the UrlExists method needs to close the response (response.Close())
From what i had understood it autoclosed but this is not the case
Hope this helps
Many thanks for the responses in helping me resolve this everyone