I’m using IEqualityComparer to match “near duplicates” in a database using LINQ to Entities.
With a record set of around 40,000, this query is taking around 15 seconds to complete and I wondered if there were any structural changes that could be made to the code below.
My public method
/// <summary>
/// Builds one summary row per group of leads belonging to the given company,
/// where leads are grouped using the <see cref="CompanyNameIgnoringSpaces"/> comparer.
/// </summary>
/// <param name="company">Company identifier passed through to <c>AllLeads</c>.</param>
/// <returns>
/// One <see cref="LeadGridViewModel"/> per group carrying the first lead's id, the
/// number of leads in the group and the company name of the group key, ordered by
/// group size, largest first.
/// </returns>
public List<LeadGridViewModel> AllHighlightingDuplicates(int company)
{
    var nameComparer = new CompanyNameIgnoringSpaces();
    var groupedLeads = AllLeads(company).GroupBy(lead => lead, nameComparer);

    var summaries = new List<LeadGridViewModel>();
    foreach (var duplicates in groupedLeads)
    {
        var summary = new LeadGridViewModel
        {
            LeadId = duplicates.First().LeadId,
            Qty = duplicates.Count(),
            CompanyName = duplicates.Key.CompanyName
        };
        summaries.Add(summary);
    }

    // OrderByDescending is a stable sort, so groups of equal size keep their grouping order.
    return summaries.OrderByDescending(summary => summary.Qty).ToList();
}
Private method to grab the leads
// Characters removed from a company name to produce CompanyNameStripped.
private char[] delimiters = new[] { ' ', '-', '*', '&', '!' };

/// <summary>
/// Loads the leads of one company into memory and fills in
/// <c>CompanyNameStripped</c> for each of them.
/// </summary>
/// <param name="company">Value matched against <c>Company_ID</c> of each lead.</param>
/// <returns>
/// The materialised leads, each with <c>LeadId</c>, <c>CompanyName</c> and
/// <c>CompanyNameStripped</c> (the company name without the delimiter characters;
/// an empty string when the name is null).
/// </returns>
private IEnumerable<LeadGridViewModel> AllLeads(int company)
{
    var leads = db.Leads
        .Where(lead => lead.Company_ID == company)
        .Select(lead => new LeadGridViewModel
        {
            LeadId = lead.Lead_ID,
            CompanyName = lead.Company_Name,
        })
        .ToList();

    // The stripping happens after ToList(), i.e. on the in-memory objects, not in the query.
    foreach (var lead in leads)
    {
        var name = lead.CompanyName ?? String.Empty;
        var fragments = name.Split(delimiters);
        lead.CompanyNameStripped = string.Join("", fragments);
    }

    return leads;
}
My IEqualityComparer
/// <summary>
/// Compares <see cref="LeadGridViewModel"/> instances by company name while ignoring
/// the characters space, '-', '*', '&amp;' and '!'.
/// </summary>
/// <remarks>
/// The ignored characters are removed before comparing. Replacing them with '\0'
/// (as string.Replace(char, char) would do) keeps a character in their place, so
/// "Acme Ltd" and "AcmeLtd" would never be treated as equal.
/// </remarks>
public class CompanyNameIgnoringSpaces : IEqualityComparer<LeadGridViewModel>
{
    // Allocated once instead of on every Equals/GetHashCode call.
    private static readonly char[] IgnoredCharacters = { ' ', '-', '*', '&', '!' };

    /// <summary>
    /// Determines whether two leads have the same company name once the ignored
    /// characters are removed.
    /// </summary>
    /// <param name="x">First lead; may be null.</param>
    /// <param name="y">Second lead; may be null.</param>
    /// <returns>
    /// True when both references are the same (including both null) or when the
    /// normalised company names match ordinally; false when exactly one is null
    /// or the normalised names differ.
    /// </returns>
    public bool Equals(LeadGridViewModel x, LeadGridViewModel y)
    {
        if (ReferenceEquals(x, y))
        {
            return true;
        }

        if (ReferenceEquals(x, null) || ReferenceEquals(y, null))
        {
            return false;
        }

        return string.Equals(
            Normalize(x.CompanyName),
            Normalize(y.CompanyName),
            StringComparison.Ordinal);
    }

    /// <summary>
    /// Returns a hash code derived from the normalised company name so that leads
    /// considered equal by <see cref="Equals(LeadGridViewModel, LeadGridViewModel)"/>
    /// share the same hash code.
    /// </summary>
    /// <param name="obj">Lead to hash; a null lead hashes to 0.</param>
    /// <returns>Hash code of the normalised company name.</returns>
    public int GetHashCode(LeadGridViewModel obj)
    {
        if (ReferenceEquals(obj, null))
        {
            return 0;
        }

        return Normalize(obj.CompanyName).GetHashCode();
    }

    // Removes every ignored character from the name; a null name becomes an empty string.
    private static string Normalize(string companyName)
    {
        if (string.IsNullOrEmpty(companyName))
        {
            return string.Empty;
        }

        // Fast path: most names contain none of the ignored characters only rarely,
        // but when they do not, no new string needs to be built.
        if (companyName.IndexOfAny(IgnoredCharacters) < 0)
        {
            return companyName;
        }

        return string.Join("", companyName.Split(IgnoredCharacters));
    }
}
One approach would be to create a computed column in the database that holds the company name with the unwanted characters stripped out.
Then use this column to do your filtering on.
This will probably ever so slightly decrease performance on inserts, but should hugely improve the query time.