数量化理論とデータ処理 駒沢勉先生 朝倉書店



#数量化1類 p.10~

library(dummies)

income=c(1,1,2,3,1,2,2,1,3,1,3,2,2,1,3,2,1,2,3,2)

income=dummy(income)

colnames(income)=c("low","normal","high")

job=c(1,3,1,4,4,3,1,3,2,1,1,3,1,3,2,4,2,1,1,2)

job=dummy(job)

colnames(job)=c("office_worker","management","labor","others")

new_or_not=c(1,2,1,2,1,2,1,1,2,2,1,1,2,2,1,1,2,2,1,1)

new_or_not=dummy(new_or_not)

colnames(new_or_not)=c("new","not")

ad=c(2,1,1,2,1,2,1,2,2,1,1,2,2,2,1,1,1,1,2,1)

ad=dummy(ad)

colnames(ad)=c("look","not")

mass=c(1,2,1,2,1,2,1,2,3,2,3,1,2,1,3,3,1,3,1,2)

mass=dummy(mass)

colnames(mass)=c("news_paper","TV","others")

price=c(2,1,5,6,3,2,6,3,5,2,01,4,3,1,9,10,3,4,6,6)

pocket=c(10,8,18,30,15,12,22,12,25,8,20,10,14,6,28,8,10,25,28,28)

Y=cbind(price,pocket)

X=cbind(income,job[,-4],ad[,-1],mass[,-3])

A=t(X)%*%X

beta=solve(A)%*%t(X)%*%Y

Y_hat=X%*%beta

cor=diag(cor(Y,Y_hat))

#t-testによる特徴量の確認 p.36

X=cbind(income,job,ad,mass)

x_ave=(t(X)%*%Y)*c(1/apply(X,2,sum))

A=array(0,dim=c(ncol(X),2))

n=apply(X,2,sum)

for(j in 1:ncol(X)){

A[j,]=(apply(t(Y*X[,j])^2,1,sum)-n[j]*x_ave[j,]^2)

}

t_test=function(X_ave,A2,N){

mat=array(0,dim=c(length(X_ave),length(X_ave)))

pthi=mat;no=mat;qts=mat

for(i in 1:dim(mat)[1]){
for(j in 1:dim(mat)[2]){

mat[i,j]=abs(X_ave[i]-X_ave[j])/sqrt(A2[i]/(N[i]*(N[i]-1))+A2[j]/(N[j]*(N[j]-1)))  

C=(A2[i]/(N[i]*(N[i]-1)))/(A2[i]/(N[i]*(N[i]-1))+A2[j]/(N[j]*(N[j]-1)))

pthi[i,j]=C^2/(N[i]-1)+(1-C)^2/(N[j]-1)

no[i,j]=paste0("(",i,",",j,")")

qts[i,j]=qt(1-0.05/2,1/(C^2/(N[i]-1)+(1-C)^2/(N[j]-1)))

}  
}

pthi=1/pthi

res=cbind(c(no),c(mat),c(pthi),c(qts))

colnames(res)=c("elements","t","dim","qt_0.05/2")

return(res)

}

t_test(x_ave[4:7,2],A[4:7,2],n[4:7])






#ミニマックス的中率

library(dplyr)

data=data.frame(group1=c("G1","G1","G1","G2","G2"),group2=c("G11","G12","G13","G21","G22"),C11=c(4,3,0,4,3),C12=c(3,3,2,5,3),C13=c(1,1,3,2,3),C21=c(4,2,2,5,3),C22=c(1,1,2,3,1),C23=c(2,3,0,1,4),C24=c(1,1,1,2,1),C31=c(6,2,3,7,4),C32=c(2,5,2,4,5))

n_G1=c(8,7,5);n_G2=c(11,9);N=sum(n_G1,n_G2)

pis=c(sum(n_G1)/N,sum(n_G2)/N)

library(stringr)



mu1=data %>% filter(group1=="G1");mu1=mu1[,str_count(colnames(mu1),"group")==0]

f1=mu1

for(j in 1:3){

no=paste0("C",j)  

f1=f1[,!(colnames(f1) %in% paste0(no,"1"))]

}

no_s=apply(mu1,2,sum)

sigma1=sd(as.matrix(mu1))

mu1=mean(as.matrix(mu1))



mu2=data %>% filter(group1=="G2");mu2=mu2[,str_count(colnames(mu2),"group")==0]

f2=mu2

for(j in 1:3){

no=paste0("C",j)  

f2=f2[,!(colnames(f2) %in% paste0(no,"1"))]

}

sigma2=sd(as.matrix(mu2))

mu2=mean(as.matrix(mu2))



sigma_t=sd(as.matrix(data[,str_count(colnames(data),"group")==0]))



alpha_dash=abs(mu1-mu2)/(sigma1+sigma2)

p_star=pnorm(alpha_dash)-pnorm(0)+0.5

sigma_B=sqrt(prod(pis)*(mu2-mu1)^2)

eta2=(sigma_B^2)/(sigma_t^2)

#alpha_dash

(sigma_t/(sigma1+sigma2))*sqrt(eta2/prod(pis))


data2=data.frame(group1=c("X1","X1","X1","X2","X2","X2","X2","X3","X3"),group2=c("C11","C12","C13","C21","C22","C23","C24","C31","C32"),C11=c(7,0,0,2,1,3,1,3,4),C12=c(0,8,0,4,1,2,1,5,3),C13=c(0,0,5,2,2,0,1,3,2),C21=c(2,4,2,8,0,0,0,5,3),C22=c(1,1,2,0,4,0,0,2,2),C23=c(3,2,0,0,0,5,0,2,3),C24=c(1,1,1,0,0,0,3,2,1),C31=c(3,5,3,5,2,2,2,11,0),C32=c(4,3,2,3,2,3,1,0,9))

H=as.matrix(data2[,str_count(colnames(data2),"group")==0])

no_s_data=data.frame(num=c(1:length(no_s)),no_s=names(no_s))

no_num=c()

for(j in 1:nrow(no_s_data)){

no=no_s_data$no_s[j]  

if(substring(no,3)=="1"){

no_num=c(no_num,j)    

}

}

H=H[-no_num,-no_num]

A1=array(0,dim=c(ncol(f1),ncol(f1)))

B1=array(0,dim=c(ncol(f1),ncol(f1)))

for(i in 1:ncol(f1)){

  p_i=c(f1[,i]/n_G1,-diag(H)[i]/sum(n_G1))

  a_vec=c();b_vec=c()

for(j in 1:ncol(f1)){  

q_j=c(f1[,j],diag(H)[j])  

a=sum(p_i*q_j);b=H[i,j]-diag(H)[i]*diag(H)[j]/sum(n_G1)

a_vec=c(a_vec,a);b_vec=c(b_vec,b)

}  

A1[i,]=a_vec  

B1[i,]=b_vec

}

P=c(1/n_G2);P[length(P)]=P[length(P)]*(-1)

d=apply(f2*P,2,sum)

solve(B1,d)



#3.7 的中率pを求める数量化 p.54~p.68

#mu2>=mu1

n1=80;n2=100;n=n1+n2

x1=rnorm(n1,-1,2)

x2=rnorm(n2,8,3)

mu1=mean(x1);mu2=mean(x2)

sd1=sd(x1);sd2=sd(x2);sd_t=sd(c(x1,x2))

f=function(a){

return(pnorm(a,mu1,sd1)-(1-pnorm(a,mu2,sd2)))

}

ite=100

eta=0.1

x=1

h=0.01

for(l in 1:ite){

df=(f(x+h)-f(x))/h

x=x-eta*f(x)/df

print(f(x))

}

x0=(sd2*mu1+sd1*mu2)/(sd1+sd2)

#ミニマックス的中率

x0_seiki=abs(mu1-mu2)/(sd1+sd2)

p=pnorm(abs(mu1-mu2)/(sd1+sd2))

alpha1=(mu2-mu1)/(sd1+sd2)

alpha2=(mu1-mu2)/(sd1+sd2)

pi1=n1/n;pi2=n2/n

#sd_Bは分離度を表す。
sd_B=sqrt(pi1*pi2*(mu1-mu2)^2)

eta2=sd_B^2/(sd_t^2)

plot(c(1:length(x1)),x1,xlim=c(1,max(c(n1,n2))),ylim=c(min(c(x1,x2)),max(c(x1,x2))),xlab="index",ylab="values",col=2)

par(new=T)

plot(c(1:length(x2)),x2,xlim=c(1,max(c(n1,n2))),ylim=c(min(c(x1,x2)),max(c(x1,x2))),xlab="index",ylab="values",col=3,main=paste0("ミニマックス的中率:",round(p,3),",","相関比:",round(eta2,3)))



#数量化理論とデータ処理 p.71 グラフプロットでの確認

library(dplyr)

L=9

p=sample(1:9,L);p=p/sum(p)

q=sample(1:9,L);q=q/sum(q)

c_vec=seq(1,10,0.0001)

plot_data=data.frame(c=c_vec,tau=0,uv=0)

for(l in 1:length(c_vec)){

c=c_vec[l]

u=((sum((p+q)*p/(p-q*c))*(1+c)/2)/(1-sum((p-q*c)*p/(p+q*c))))

if(u<0){

#u=sqrt(u)  

plot_data$tau[l]=NaN

plot_data$uv[l]=NaN 

}else{

u=sqrt(u)  

x=((p-q)*(1+c)/(2*u)+u*(p-q*c))/(p+q*c)

x=x-mean(x)

sig_p=sum(p*(x-u)^2)

sig_q=sum(q*(x-(-u))^2)

tau=sig_p/sig_q

lam=2*u/(1+c)

mu=2*u*c/(1+c)

plot_data$tau[l]=tau

plot_data$uv[l]=2*u

}

#c=mu/lam

#print(tau)  

}

plot_data=plot_data%>%filter(is.nan(tau)!=T)

plot(plot_data$c,plot_data$tau,col=2,xlab="c",ylab="tau",main="")

c=plot_data$c[abs(plot_data$tau-1)==min(abs(plot_data$tau-1))]

u=sqrt((sum((p+q)*p/(p-q*c))*(1+c)/2)/(1-sum((p-q*c)*p/(p+q*c))))

x=((p-q)*(1+c)/(2*u)+u*(p-q*c))/(p+q*c)

x=x-mean(x)

sig_p=sum(p*(x-u)^2)

sig_q=sum(q*(x-(-u))^2)

tau=sig_p/sig_q

lam=2*u/(1+c)

mu=2*u*c/(1+c)

mean(x)


#数量化理論とデータ処理 準ニュートン法

library(dplyr)

L=5

p=sample(1:9,L);p=p/sum(p)

q=sample(1:9,L);q=q/sum(q)

#Broyden Quasi-Newton Algorithm p.213

f=function(x){

lambda=x[1];mus=x[2];x=x[-c(1:2)]  

c=mus/lambda

f1=sum(p*(x-sum(p*x))^2)-1

f2=sum(q*(x-sum(q*x))^2)-1

U=(lambda+mus)/2

f3=(p-q)-lambda*(p*x-p*U)-mus*(q*x-q*(-U))

return(c(f1,f2,f3))

}

X=rep(1,L+2)

ite=10^5

eta=0.001

H=diag(f(X))

for(l in 1:ite){

if(sum(abs(f(X)))>10^(-9)){  

X_pre=X  

X=X-eta*H%*%f(X)  

#X[-c(1:2)]=X[-c(1:2)]-mean(X[-c(1:2)])

s=X-X_pre

y=f(X)-f(X_pre)

H=H+((s-H%*%y)/as.numeric(t(s)%*%H%*%y))%*%t(s)%*%H

#print(paste0(sum(X[1:2]),",",sum(abs(f(X)))))

#print(sum(abs(f(X))))

}

}  


#数量化3類 p.90~

data=data.frame(パターン=1:13,個数=c(1,3,2,5,3,2,4,3,1,1,3,1,1),無口である=c(0,0,0,0,0,0,0,1,0,1,0,0,0),生真面目である=c(1,0,0,0,1,0,0,0,0,1,0,0,0),空想を好む=c(1,1,0,0,0,0,1,0,0,0,1,0,1),活発である=c(1,0,0,1,0,0,1,0,0,0,1,1,1),冗談をいう=c(1,1,0,1,0,1,1,0,1,0,0,0,0),きれい好きである=c(0,0,1,0,1,0,0,1,1,0,0,0,0),粘り強い=c(0,0,0,0,1,0,0,1,1,1,0,1,1))

delta=as.matrix(data[,-c(1:2)])

t=sum(delta)

n=nrow(delta);m=ncol(delta)

c=apply(delta,2,sum)

d=apply(delta,1,sum)

C=diag(c);D=diag(d)

X=t(delta)

A=solve(sqrt(C))%*%X%*%solve(D)%*%t(X)%*%solve(sqrt(C))

lam=eigen(A)$values

z=eigen(A)$vectors

#カテゴリウエイト
u=solve(sqrt(C))%*%z[,2]*sqrt(t)

#個体ウエイト
v=lam[2]*sqrt(t)*solve(D)%*%t(X)%*%solve(sqrt(C))%*%z[,2]

#カテゴリ得点
y=X%*%v

#個体得点
w=t(X)%*%u

#寄与率
lam[-1]/sum(lam[-1])