Counting Word Frequency With Python
A Python script to crudely tally the number of times a space-delimited word appears in a text file. Uses a Counter object to track the number of occurrences for each word.
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 26 20:04:11 2012
Prints the most common words in a file, with their counts. Words are the runs of word characters left after punctuation is replaced by spaces.
@author: robert
"""
from collections import Counter
import re
def openfile(filename):
    """Return the entire contents of the text file at `filename`.

    The file is opened read-only, so files without write permission can be
    read, and it is closed even if reading raises.

    Raises whatever `open` or `read` raises (e.g. IOError/OSError when the
    file does not exist or is unreadable).
    """
    # The context manager guarantees the handle is released on every path.
    with open(filename, "r") as fh:
        return fh.read()
def removegarbage(str):
    """Return `str` lowercased, with punctuation and whitespace runs collapsed.

    Every run of one or more non-word characters (anything the regex class
    \\W matches) is replaced by a single space before lowercasing, so the
    result may begin or end with a space.
    """
    return re.sub(r'\W+', ' ', str).lower()
def getwordbins(words):
    """Return a Counter mapping each item of `words` to its number of occurrences."""
    tally = Counter()
    # Passing an iterator makes update() count element by element, exactly
    # like a manual "+= 1" loop, regardless of the container type given.
    tally.update(iter(words))
    return tally
def main(filename, topwords):
    """Print the `topwords` most frequent words of `filename`, one per line.

    The file is read with `openfile`, normalised with `removegarbage`, split
    into words and tallied with `getwordbins`. Each output line has the form
    "word count", most frequent first.
    """
    txt = removegarbage(openfile(filename))
    # split() with no argument drops the empty strings that split(' ')
    # yields when the cleaned text starts or ends with a space, so the empty
    # string is never counted as a word.
    words = txt.split()
    bins = getwordbins(words)
    for key, value in bins.most_common(topwords):
        # A single pre-formatted argument prints "word count" identically
        # under both the Python 2 print statement and the Python 3 function.
        print("%s %s" % (key, value))
# Tally speech.txt and show at most the 500 most frequent words.
main(filename='speech.txt', topwords=500)
Example output:
the 235 and 161 to 132 of 125 that 101 a 91 in 83 we 70 is 54 our 40 who 39 for 39 people 37 not 36 are 32 on 29 it 28 be 28 their 27 must 26 have 25 those 25 will 25 with 25 as 22 world 21 this 19 all 18 america 18 because 18 from 17 they 17 i 17 an 17
(…)