Before I write anything, I would like to thank the sources for their code snippets from which I compiled my version. (Norvig Spelling Correction, Lorenzo Stokes, Website: http://www.codegrunt.co.uk/2010/11/02/C-Sharp-Norvig-Spelling-Corrector.html, Accessed: September 2011) and (Levenshtein Distance Algorithm, Lasse Johansson, Website: http://www.merriampark.com/ldcsharp.htm, Accessed: September 2011)
This search will first correct any spelling mistakes in your search query, then it will try to get a list of exact matches, partial matches and finally a distance query. The distance query will use the Levenshtein Distance Algorithm to find words that you may have meant.
Firstly, you need the big.txt file http://norvig.com/big.txt
The Dictionary that it creates from this file takes about 1.5s to load, so I would inject this at the start of your application via a container. In this case I am using Autofac
var bigDictionary = Regex.Matches(Resources.big.ToLower(), "[a-z]+")
.Cast<Match>()
.GroupBy(m => m.Value, m => m.Value)
.ToDictionary(gr => gr.Key, gr => gr.Count());
builder.RegisterInstance(bigDictionary)
.Named<IDictionary<string, int>>("BigTxt")
.AsSelf()
.SingleInstance();
builder.Register(c => new SearchService(c.ResolveNamed<IDictionary<string, int>>("BigTxt"))).As(typeof(ISearchService))
.InstancePerHttpRequest();
Second we need the Search Service
UPDATE: I have refactored this into a Self Contained Service
using StrEnum = System.Collections.Generic.IEnumerable<string>;
public class SearchService
{
private readonly IDictionary<string, int> bigDictionary;
private const string ALPHABET = "abcdefghijklmnopqrstuvwxyz";
public SearchService(IDictionary<string, int> bigDictionary)
{
this.bigDictionary = bigDictionary;
}
public StrEnum SearchThroughList(string searchTerm, StrEnum itemList)
{
var matches = new List<string>();
string[] words = searchTerm.Split(' ');
string term = string.Join(" ", words.Select(GetPossibleMisspelledWord));
matches.AddRange(GetClosestMatch(term, itemList));
if (matches.Count() == 0)
{
matches.AddRange(GetDistanceMatch(itemList, term));
}
return matches.Distinct();
}
private static StrEnum GetClosestMatch(string word, StrEnum titles)
{
var matches = new List<string>();
string exactMatch = titles.SingleOrDefault(x => x.ToLowerInvariant() == word.ToLowerInvariant());
if (exactMatch != null)
matches.Add(exactMatch);
StrEnum partialMatches =
titles.Where(x => x.ToLowerInvariant().Contains(word.ToLowerInvariant()));
matches.AddRange(partialMatches);
return matches;
}
private static StrEnum GetDistanceMatch(StrEnum titles, string word)
{
var distanceDictionary = new Dictionary<string, int>();
titles.ToList().ForEach(x => distanceDictionary.Add(x, Distance(x, word)));
return distanceDictionary
.Where(y => y.Value == distanceDictionary.Values.Min()).Select(x => x.Key).Take(2);
}
private string GetPossibleMisspelledWord(string word)
{
var nWords = bigDictionary;
Func<StrEnum, StrEnum> nullIfEmpty = c => c.Any() ? c : null;
StrEnum candidates = nullIfEmpty(new[] { word }.Where(nWords.ContainsKey))
?? nullIfEmpty(Edits(word).Where(nWords.ContainsKey))
?? nullIfEmpty((Edits(word).SelectMany(Edits, (e1, e2) => new { e1, e2 })
.Where(match => nWords.ContainsKey(match.e2))
.Select(match => match.e2))
.Distinct());
return candidates == null ? word : candidates
.OrderByDescending(cand => (nWords.ContainsKey(cand) ? nWords[cand] : 1)).First();
}
private static int Distance(string s, string t)
{
var n = s.Length;
int m = t.Length;
var d = new int[n + 1, m + 1];
if (n == 0) return m;
if (m == 0) return n;
for (int i = 0; i <= n; d[i, 0] = i++) {}
for (int j = 0; j <= m; d[0, j] = j++) {}
for (int i = 1; i <= n; i++)
{
for (int j = 1; j <= m; j++)
{
int cost = (t.Substring(j - 1, 1) == s.Substring(i - 1, 1) ? 0 : 1);
d[i, j] = Math.Min(Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
return d[n, m];
}
private static StrEnum Edits(string w)
{
// Deletion
return (from i in Enumerable.Range(0, w.Length)
select w.Substring(0, i) + w.Substring(i + 1))
// Transposition
.Union(from i in Enumerable.Range(0, w.Length - 1)
select w.Substring(0, i) + w.Substring(i + 1, 1) +
w.Substring(i, 1) + w.Substring(i + 2))
// Alteration
.Union(from i in Enumerable.Range(0, w.Length)
from c in ALPHABET
select w.Substring(0, i) + c + w.Substring(i + 1))
// Insertion
.Union(from i in Enumerable.Range(0, w.Length + 1)
from c in ALPHABET
select w.Substring(0, i) + c + w.Substring(i));
}
}
This would produce the following results:
search for afterlfe
will match:
Afterlife
The Afterlife
An Afterlife with Jane
a search for Tip Gaer
will match:
Top Gear
Top Gear Series 2
Top Gear Polar Special
a search for Tip Gaer Poler spfoul
will match:
Top Gear Polar Special