Looking For Words With an Edit Distance of 1 or 2 From Other Words

This code is a modification of Peter Norvig’s spelling corrector that adds the closest_nearby_word() method, which identifies the most-frequently-seen correctly-spelled word that has an edit distance of 1 or 2 from the given correctly-spelled word.

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""

An altered version of Peter Norvig's spelling corrector
(Source: http://norvig.com/spell-correct.html)

"""
from collections import Counter
import re

# Get the whitespace-delimited words from a text, minus any punctuation
def words(text): return re.findall('[a-z]+', text.lower())

# Count the frequency with which each word occurs
def train(features):
 model = Counter()
 for f in features:
 model[f] += 1
 return model

# Run training using a book with words we'll consider to be spelled correctly
NWORDS = train(words(file('big.txt').read()))

# Get strings with an edit distance of 1 from the given word
def edits1(word):
 alphabet = 'abcdefghijklmnopqrstuvwxyz'
 splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
 deletes = [a + b[1:] for a, b in splits if b]
 transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
 replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
 inserts = [a + c + b for a, b in splits for c in alphabet]
 return set(deletes + transposes + replaces + inserts)

# Get strings with an edit distance of 2 from the given word
def known_edits2(word):
 return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

# Get any known words matching the given mutated words
def known(words):
 return set(w for w in words if w in NWORDS)

# Suggest a correction by mutating a word and choosing the most likely replacement
# based on how often the mutated words appear in the trained model NWORDS
def correct(word):
 candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
 print candidates
 return max(candidates, key=NWORDS.get)

# Get the likeliest correctly-spelled word
def closest_nearby_word(word):
 nearby = set()
 for e in known(edits1(word) or known_edits2(word)):
 if (e != word):
 nearby.add(e)
 if not nearby: return set()
 return max(nearby, key=NWORDS.get)

#print correct('speling')

# Run some test cases for finding "nearby" words
for w in frozenset(['rational', 'woman', 'rogue', 'effect', 'started', 'rein',
 'scalded', 'mislead', 'reality', 'whit', 'marshal', 'voila',
 'aide', 'tiered', 'county', 'fires', 'stated', 'soldier',
 'beset', 'affect', 'vice', 'wreck', 'spayed', 'complimentary',
 'their', 'principal', 'moral', 'especially', 'steal',
 'personal', 'why', 'heroine', 'descendant', 'baited',
 'interested', 'sole', 'think', 'physics', 'corps', 'discrete']):
 print w, "-", closest_nearby_word(w)

RESULTS

With a better training corpus, this method could possibly be used to identify misspelled words that are overlooked by most spell checkers. But better starting points are available.

think - thing
corps - crops
stated - states
baited - waited
aide - side
beset - best
fires - fire
scalded - scolded
moral - morel
whit - what
principal - principals
wreck - wrack
personal - set([])
heroine - heroin
reality - set([])
their - theirs
interested - set([])
voila - set([])
woman - women
rational - national
started - stated
sole - some
effect - effects
rogue - vogue
affect - effect
why - who
descendant - descendants
county - count
spayed - stayed
especially - specially
vice - voice
physics - set([])
discrete - discreet
tiered - tired
mislead - misled
soldier - soldiers
rein - vein
complimentary - complementary
steal - steel
marshal - marshall