PythonでPCA(主成分分析)
作ったんで載せてみます。outputのところは自分の見たいものが何かによって違うだろうし、そもそも第一主成分軸だけが欲しいならソートしなくてもいいですね。その辺は読んで理解して自分で書き換えてください。ちなみにコメント込み87行で、実行時間のほとんどはsummationのところで消費されます(もちろんデータのサイズによって違うでしょうけど)
#
# PCA_TSV:
# Pricipal Component Analysis of Tab Separated Values
#
#
# settings
datafile="methano.fna.FI"
#
# load datafile
print "loading data"
data=[]
for line in file(datafile):
items=line.split()
data.append([float(x) for x in items])
N=len(data)
DIM=len(data[0])
#
# make covariance matrix
# init average list
ave=[0.0] * DIM
# init covar matrix
covar=[]
for i in range(DIM):
covar.append(
[0.0] * DIM
)
# summation
print "summating"
for d in data:
for i in range(DIM):
for j in range(i, DIM):
covar[i][j] += d[i] * d[j]
ave[i] += d[i]
# calc average
for i in range(DIM):
ave[i] /= N
# calc covariance
for i in range(DIM):
for j in range(i, DIM):
v = covar[i][j]/N - ave[i] * ave[j]
covar[i][j] = v
covar[j][i] = v
#
# diagonalise the covar matrix
print "diagonalising"
from numarray import array
import numarray.linear_algebra as la
(eigenval, eigenvec) = la.eigenvectors(array(covar))
#
# sort eigenvectors by eigenvalue
print "sorting"
def inv_cmp(x, y):
return cmp(y, x)
axis=zip(eigenval, eigenvec)
axis.sort(inv_cmp)
sumVar=sum(eigenval)
#
# output
fo=file("pca.txt", "w")
for i in range(10):
(v, vec) = axis[i]
print v/sumVar
fo.write(
"\t".join([str(x) for x in vec])
)
fo.write("\n")
fo.close()