using System;
using System.IO;
using System.Net;
using System.Text.RegularExpressions;
namespace Working
{
class Program4
{
static string errorurl =
"http://www.realtor.ca/propertyDetails.aspx?propertyId=8692663";
static void Main(string[] args)
{
string s;
s = getWebpageContent(errorurl);
s = removeNewLineCharacters(s);
getFields(s);
Console.WriteLine("End");
}
public static void getFields(string html)
{
Match m;
string fsRE = @"ismeasurement.*?>.*?(\d+).*?sqft";
m = Regex.Match(html, fsRE, RegexOptions.IgnoreCase);
}
private static string removeNewLineCharacters(string str)
{
string[] charsToRemove = new string[] { "\n", "\r" };
foreach (string c in charsToRemove)
{
str = str.Replace(c, "");
}
return str;
}
static string getWebpageContent(string url)
{
WebClient client = new WebClient();
client.Headers.Add("user-agent",
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2;
.NET CLR 1.0.3705;)");
Stream data = client.OpenRead(url);
StreamReader reader = new StreamReader(data);
string s = reader.ReadToEnd();
data.Close();
reader.Close();
return s;
}
}
}
This program hangs. It runs correctly when I remove RegexOptions.IgnoreCase option or
when I remove call to removeNewLineCharacters() function.
Could someone tell me what is going on, please?
Seeing so many lazy matches, I guess too much time is spent on backtracking.
Try to refactor it into not using lazy matches, e.g.
The reason removing
RegexOptions.IgnoreCaseworks is because there is only the string “isMeasurement” in that page. The reason removingremoveNewLineCharactersworks is because.doesn’t match new lines so it can stop early.(BTW, why are you matching HTML with Regex?)