« お題1:ファイルの同期 | メイン | お題3:シングルトン »

お題2:単語数カウント

英文のテキストファイルを読み込んで単語の出現数を数えるプログラムを作れ。


例えばテキストファイルの中身が「It's fine day, isn't it? Yes, it is!」ならばit'sが1回、fineが1回、dayが1回、isn'tが1回、itが2回、yesが1回、isが1回となるように数えてよい。
(isn'tにはisが含まれているな、とか、It'sの'sはisの省略形だな、などと判断するのはとてもむずかしいので)


余力があれば出現頻度の多い順に出力するプログラムも書け。

トラックバック

このエントリーのトラックバックURL:
http://www.nishiohirokazu.org/mt/mt-tb.cgi/599

この一覧は、次のエントリーを参照しています: お題2:単語数カウント:

» [python] お題2: 単語数のカウント 送信元 Fomalhaut of Piscis Australis
単語数のカウントです。 頻度順じゃなくてアルファベット順でソートしてます orz あとで書き直すかも。 [詳しくはこちら]

コメント (12)

# -*- encoding:utf-8 -*-

"""
>>> theme2_1("It's fine day, isn't it? Yes, it is!")
{"it's": 1, 'is': 1, 'it': 2, 'yes': 1, 'fine': 1, 'day': 1, "isn't": 1}
>>> theme2_2(theme2_1("It's fine day, isn't it? Yes, it is!"))
[('it', 2), ('day', 1), ('fine', 1), ('is', 1), ("isn't", 1), ("it's", 1), ('yes', 1)]
"""
import re

def theme2_1(target):
    """Count how often each word occurs in ``target``.

    The text is cut at every character that is not an ASCII letter or an
    apostrophe, empty pieces are discarded, and the remaining pieces are
    lower-cased before being counted.

    :param target: the text to analyse.
    :return: dict mapping each lower-cased word to its number of occurrences.
    """
    splitter = re.compile(r"[^'A-Za-z]|\s")
    tally = {}
    for piece in splitter.split(target):
        if not piece:
            # Adjacent separators produce empty strings; they are not words.
            continue
        word = piece.lower()
        if word in tally:
            tally[word] += 1
        else:
            tally[word] = 1
    return tally

def theme2_2(countDict):
    """Sort the result of the word count.

    :param countDict: dict mapping word -> number of occurrences.
    :return: list of ``(word, count)`` tuples ordered by descending count;
        words with the same count are ordered alphabetically.
    """
    # A key function replaces the cmp-style comparator: negating the count
    # gives "largest first" while the word itself breaks ties ascending.
    return sorted(countDict.items(), key=lambda item: (-item[1], item[0]))


if __name__ == '__main__':
    # When run as a script, execute the doctests found in this module's
    # docstrings.
    import doctest
    doctest.testmod()
    

磯野フネ:
# Count the words in text.txt and print "count word" lines, most frequent
# first.
f = open('text.txt')
try:
    text = f.read()
finally:
    # Release the file handle even if reading fails.
    f.close()

# Delete punctuation outright so that "day," and "day" are the same word.
# Apostrophes are not in the list, so contractions like "isn't" stay intact.
for mark in ',.!?():':
    text = text.replace(mark, '')

# Words are whitespace-separated and compared case-sensitively.
counts = {}
for word in text.split():
    counts[word] = counts.get(word, 0) + 1

# The sort is stable, so words with equal counts keep the dict's own order.
ranked = sorted(counts.items(), key=lambda item: -item[1])
for word, count in ranked:
    print('%s %s' % (count, word))
海坊主:
import sys
import re

def main():
    """Print the word frequencies of the file named on the command line.

    Exactly one command-line argument (the text file) is expected; otherwise
    a hint is written to stderr and nothing is counted.  Words are runs of
    ``\\w`` characters and apostrophes, lower-cased before counting.  Output
    is one ``word count`` line per word, highest count first and
    alphabetical among equal counts.
    """
    if len(sys.argv) != 2:
        sys.stderr.write("specify text file")
        return

    # Compiled once, as a raw string so that "\w" reaches the regex engine
    # untouched.
    word_pattern = re.compile(r"(?:\w|')+")
    counter = {}
    text_file = open(sys.argv[1])
    try:
        for line in text_file:
            for word in word_pattern.findall(line):
                word = word.lower()
                counter[word] = counter.get(word, 0) + 1
    finally:
        # Close the input even if reading or matching raises.
        text_file.close()

    for word, count in sorted(counter.items(), key=lambda a: (-a[1], a[0])):
        print("%s %s" % (word, count))


if __name__ == '__main__':
    main()


HS:
import fileinput
import re

# Maps each lower-cased word to the number of times it has been seen.
word_count = {}

# fileinput yields the lines of every file named on the command line, or of
# standard input when no file is given.
for line in fileinput.input():
    for word in re.findall(r"[\w']+", line):
        word = word.lower()
        word_count[word] = word_count.get(word, 0) + 1

# Highest count first; words with equal counts come out in reverse
# alphabetical order.  The key is an explicit (count, word) tuple so that
# ints are only ever compared with ints and strs with strs.
for word, count in sorted(word_count.items(),
                          key=lambda item: (item[1], item[0]),
                          reverse=True):
    print("%s %s" % (word, count))

にしお:
ちなみにテストデータとして英語の長い文章がほしければ
http://www.gutenberg.org/etext/108
こんなのはどうでしょう。
bonlife:
import re

def word_count(f):
    """Count the words found in an iterable of text lines.

    Each line is cut at every character that is neither an ASCII letter nor
    an apostrophe; the non-empty pieces are lower-cased and tallied.

    :param f: file object (or any iterable yielding strings).
    :return: dict mapping lower-cased word -> number of occurrences.
    """
    separator = re.compile(r"[^a-z']", re.IGNORECASE)
    totals = {}

    for line in f:
        lowered = [piece.lower() for piece in separator.split(line)]
        for key in lowered:
            if not key:
                # Consecutive separators yield empty pieces; skip them.
                continue
            try:
                totals[key] += 1
            except KeyError:
                totals[key] = 1
    return totals

def order_word_count(d, order="desc"):
    """Order a word_count() result by count.

    :param d: dict mapping word -> count.
    :param order: ``"desc"`` for highest count first (the default) or
        ``"asc"`` for lowest count first.  In both cases words with the same
        count are listed alphabetically.
    :return: list of ``(word, count)`` tuples, or ``None`` when ``order`` is
        neither ``"desc"`` nor ``"asc"``.
    """
    if order == "desc":
        # Negating the count sorts it descending while the word stays
        # ascending as the tie-breaker.
        return sorted(d.items(), key=lambda item: (-item[1], item[0]))
    if order == "asc":
        return sorted(d.items(), key=lambda item: (item[1], item[0]))
    # Unknown order: no result is produced.
    return None

# Script entry point: fetch a page over HTTP, count its words and print them
# with the default ("desc") ordering of order_word_count().
# The commented-out lines are an offline variant that feeds the sample
# sentence through a StringIO instead of downloading anything.
# NOTE(review): urllib2 and the print statement exist only on Python 2.
# NOTE(review): the URL looks like a Project Gutenberg catalogue page, so the
# counted text may be that page's HTML rather than the book -- confirm.
if __name__ == "__main__" :
#    from StringIO import StringIO
#    str = """It's fine day, isn't it? Yes, it is!"""
#    for k, v in order_word_count(word_count(StringIO(str))):
#        print " %-30s : %5s" % (k, v)
    import urllib2
    # The response object is iterated line by line inside word_count().
    for k, v in order_word_count(word_count(urllib2.urlopen('http://www.gutenberg.org/etext/108'))):
        # Word left-aligned in 30 columns, count right-aligned in 5.
        print " %-30s : %5s" % (k, v)

pepsilove:
import sys
import re

def cntwrd(wordlist):
    """Count how many times each non-empty word occurs in ``wordlist``.

    :param wordlist: sequence of strings; empty strings are ignored.
    :return: list of ``[word, count]`` pairs in order of each word's first
        appearance.  An empty list is returned when there is nothing to
        count, so the result can always be iterated.
    """
    totals = {}
    first_seen = []  # remembers the order in which distinct words appeared
    for word in wordlist:
        if len(word) == 0:
            continue
        if word not in totals:
            totals[word] = 0
            first_seen.append(word)
        totals[word] += 1
    return [[word, totals[word]] for word in first_seen]

def printlist(list, n):
    """Print ``word : count`` lines sorted by column ``n`` of each entry.

    :param list: sequence of ``[word, count]`` entries.
    :param n: index of the column to sort on (ascending); entries that are
        equal in that column are ordered by comparing the whole entry.
    """
    for entry in sorted(list, key=lambda item: (item[n], item)):
        print("%s : %s" % (entry[0], entry[1]))

# Slurp every line of standard input, glue the lines together with spaces
# and cut the result at each character that is not an ASCII letter, a digit
# or an apostrophe.  The resulting list may contain empty strings.
strAlltext = list(sys.stdin)
lstAlltext = re.compile("[^a-zA-Z0-9']").split(" ".join(strAlltext))

if __name__ == "__main__":
    # Count the words, then print them sorted on the count column (index 1).
    sortlist = cntwrd(lstAlltext)
    printlist(sortlist, 1)

morchin:
# -*- coding: utf-8 -*-
import re, operator
	
def word_count(f):
    """Build a dict of words and their occurrence counts.

    Digits are not part of a word and upper/lower case is not distinguished:
    the text is cut at every run of characters other than ASCII letters and
    apostrophes, and each piece is lower-cased.

    :param f: the text to analyse, as a string.
    :return: dict mapping lower-cased word -> number of occurrences.
    """
    pieces = re.compile(r"[^'a-zA-Z]+").split(f)
    words = {}
    # Leading/trailing separators leave empty pieces behind; drop them.
    for word in (piece.lower() for piece in pieces if piece):
        if word in words:
            words[word] += 1
        else:
            words[word] = 1
    return words
	
def print_words(words):
    """Print ``word: count`` lines in order of descending frequency.

    Words with the same count are not sorted alphabetically; they keep the
    relative order in which the dict yields them.

    :param words: dict mapping word -> integer count.
    """
    by_count = operator.itemgetter(1)
    for word, cnt in sorted(words.items(), key=by_count, reverse=True):
        # Single parenthesised argument: valid as both the Python 2 print
        # statement and the Python 3 print function.
        print('%s: %d' % (word, cnt))
	
if __name__ == '__main__':
    # Demonstrate the counter on the sample sentence from the exercise.
    sample = "It's fine day, isn't it? Yes, it is!"
    print_words(word_count(sample))
    # Alternative input, left disabled: count a text fetched over HTTP.
    #import urllib2
    #print_words(word_count(
    #    urllib2.urlopen('http://www.gutenberg.org/etext/108').read()
    #))

__unko__:
# -*- encoding:sjis -*-
import sys
import re

def wc(str):
    """Count the space-separated words of ``str`` and print the result.

    The characters ``! ? , . ' "`` and ``|`` are deleted first (so "It's"
    becomes "Its"), then the text is split on single spaces.  Counting is
    case-sensitive.  The dict of word -> count is printed, not returned.

    :param str: the text to analyse.
    """
    # Everything between [ and ] is a literal character, so "|" is removed
    # along with the punctuation marks.
    p = re.compile("[!|?|,|.|'|\"]")
    sp = p.sub("", str).split(" ")

    dec = {}
    for w in sp:
        # Consecutive spaces produce empty strings; they are not words.
        if len(w) > 0:
            dec[w] = dec.get(w, 0) + 1

    print(dec)

if __name__ == "__main__":
    # Count the file named by the first command-line argument.  If no
    # argument was given or the file cannot be opened or read, fall back to
    # the sample sentence.
    try:
        f = open(sys.argv[1], "r")
        try:
            text = f.read()
        finally:
            # Always release the file handle.
            f.close()
        # wc() splits on single spaces only, so line breaks become spaces.
        text = text.replace("\n", " ")
    except (IndexError, IOError):
        # Only the two expected failures select the fallback; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        text = "It's fine day, isn't it? Yes, it is!"

    wc(text)

odz:
import sys, re, fileinput
import itertools

# A word starts with a \w character and continues with \w characters or
# apostrophes.
WORD_PATTERN = re.compile(r'[\w][\w\']*')


def words(s):
    """Return the list of words found in the string ``s``, in order."""
    return [found.group(0) for found in WORD_PATTERN.finditer(s)]

def normalized_words(s):
    """Return a generator over the words of ``s``, each one lower-cased."""
    found = words(s)
    return (token.lower() for token in found)

def main(args):
    """Print the word frequencies of the given files, most frequent first.

    :param args: list of file names handed to ``fileinput.input``.

    Each output line is the count left-aligned in five columns followed by
    the lower-cased word.  Words with equal counts keep the order in which
    the dict yields them.
    """
    counts = dict()
    fp = fileinput.input(args)
    try:
        # Count line by line so the whole input never has to sit in memory.
        for line in fp:
            for word in normalized_words(line):
                counts[word] = counts.get(word, 0) + 1
    finally:
        fp.close()

    # Stable sort on the negated count: descending frequency.
    items = sorted(counts.items(), key=lambda item: -item[1])
    for word, count in items:
        print('%-5d %s' % (count, word))


if __name__ == '__main__':
    main(sys.argv[1:])


odz:
import sys, re, fileinput

def chain(iterable):
    """Lazily yield every element of every iterable inside ``iterable``.

    The inner iterables are consumed one after another, in order.
    """
    for group in iterable:
        for member in group:
            yield member

# A word starts with a \w character and continues with \w characters or
# apostrophes.
WORD_PATTERN = re.compile(r"\w[\w']*")


def words(s):
    """Return the list of words found in the string ``s``, in order."""
    return [found.group(0) for found in WORD_PATTERN.finditer(s)]

def counts(iterable):
    """Return a dict mapping each item of ``iterable`` to its frequency."""
    tally = {}
    for entry in iterable:
        if entry in tally:
            tally[entry] += 1
        else:
            tally[entry] = 1
    return tally

def sort_by_value(dct, compare=None):
    """Return the items of ``dct`` sorted by value.

    :param dct: the dict whose ``(key, value)`` items are sorted.
    :param compare: optional cmp-style function ``compare(a, b)`` returning
        a negative, zero or positive number; it receives two *values*.
        When omitted, values are sorted in their natural ascending order.
    :return: a new list of ``(key, value)`` tuples.  The sort is stable, so
        items with equal values keep the order in which the dict yields them.
    """
    if compare is None:
        # Natural ordering needs no comparator at all.
        return sorted(dct.items(), key=lambda item: item[1])

    # Adapt the two-argument comparator to a key function; imported here so
    # the dependency stays local to this code path.
    from functools import cmp_to_key
    return sorted(dct.items(),
                  key=cmp_to_key(lambda x, y: compare(x[1], y[1])))

def main(args):
    """Print the word frequencies of the given files, most frequent first.

    :param args: list of file names handed to ``fileinput.input``.

    Each output line is the lower-cased word, a tab, and the count
    right-aligned in five columns.
    """
    fp = fileinput.input(args)
    try:
        # One lazy stream of lower-cased words across all input lines.
        ws = chain((w.lower() for w in words(line)) for line in fp)
        cs = counts(ws)
        # sort_by_value() sorts ascending; reversing gives descending counts.
        for word, count in reversed(sort_by_value(cs)):
            print('%s\t%5d' % (word, count))
    finally:
        # Close the input even if counting or printing fails.
        fp.close()


if __name__ == '__main__':
    main(sys.argv[1:])

import re


def main(path='hoge.txt'):
    """Print the word frequencies of a text file.

    :param path: file to read; defaults to ``hoge.txt``.

    Each line is cut at every character that is not an ASCII letter or an
    apostrophe (so digits are ignored); the pieces are lower-cased and
    counted.  Output is one ``count:word`` line per word with the count
    right-aligned in four columns, highest count first and alphabetical
    among equal counts.
    """
    wm = re.compile(r"[^'A-Za-z]|\s")
    dictionary = {}
    ifile = open(path)
    try:
        for line in ifile:
            for word in wm.split(line):
                # Adjacent separators yield empty strings; test by value,
                # not by object identity.
                if not word:
                    continue
                lword = word.lower()
                dictionary[lword] = dictionary.get(lword, 0) + 1
    finally:
        # Close the file even if reading fails part-way through.
        ifile.close()

    # Negated count sorts descending; the word breaks ties ascending.
    ranked = sorted(dictionary.items(), key=lambda item: (-item[1], item[0]))
    for word, count in ranked:
        print('%4d:%s' % (count, word))


if __name__ == '__main__':
    main()

コメントを投稿

About

2007年06月11日 18:56に投稿されたエントリーのページです。

ひとつ前の投稿は「お題1:ファイルの同期」です。

次の投稿は「お題3:シングルトン」です。

他にも多くのエントリーがあります。メインページやアーカイブページも見てください。

Powered by
Movable Type 3.34