# ロジスティック回帰分析ーSASを使った統計解析の実際 (Logistic Regression Analysis: Practical Statistical Analysis with SAS)



# SMR (multiplicative) model p.119: fit d_ij ~ n_ij * exp(mu + a_i + b_j) by alternating row/column updates

I=5;J=9

d_mat=array(0,dim=c(I,J))

n_mat=array(0,dim=c(I,J))

a=rep(1,I);b=rep(1,J)

# simulate denominators n_ij as sums of row-level and column-level Poisson draws
for(i in 1:I){

val=rpois(J,sample(10:100,1))  

n_mat[i,]=n_mat[i,]+val

}

for(j in 1:J){

val=rpois(I,sample(10:100,1))  

n_mat[,j]=n_mat[,j]+val  

}

# simulate event counts d_ij with 1 <= d_ij <= n_ij - 1
# (n_ij is a sum of two Poisson draws with large means, so n_ij > 1 in practice)
for(i in 1:I){
for(j in 1:J){  

d_mat[i,j]=n_mat[i,j]-sample(1:(n_mat[i,j]-1),1)

}
}

ite=100

mu=1

# alternate closed-form updates of the row effects a and column effects b
for(l in 1:ite){

a_pre=a;b_pre=b  

di_plus=apply(d_mat,1,sum)

a=c(log(di_plus/(n_mat%*%exp(b))))  # update row effects given current b

dj_plus=apply(d_mat,2,sum)

b=c(log(dj_plus/(t(n_mat)%*%exp(a))))  # update column effects given current a

print(sum(abs(a_pre-a))+sum(abs(b_pre-b)))  # L1 change in parameters; should shrink toward 0

pij=d_mat/n_mat;log_pij=log(pij)  # observed rates and their logs

predict=array(0,dim=c(I,J))

for(i in 1:I){
for(j in 1:J){  

predict[i,j]=a[i]+b[j]   

}
}

mu=mean(log_pij-predict)  # overall level mu absorbs the mean log-scale residual

predict=predict+mu

print(sum(abs(pij-exp(predict))))

}
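
# Hedged cross-check (not from the book): d_ij ~ n_ij*exp(mu+a_i+b_j) is a
# Poisson log-linear model with offset log(n_ij), so glm() should give
# fitted rates similar to exp(predict) above. fit_chk/df_chk are my names.
df_chk=data.frame(d=c(d_mat),n=c(n_mat),
                  row=factor(rep(1:I,times=J)),col=factor(rep(1:J,each=I)))
fit_chk=glm(d~row+col+offset(log(n)),family=poisson,data=df_chk)
head(cbind(glm=fitted(fit_chk)/df_chk$n,iterative=c(exp(predict))))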





# binomial logistic regression p.183: Newton-Raphson (Fisher scoring) on grouped binomial data

library(dplyr)  # needed for the mutate() call at the end of this section

data=data.frame(num=1:20,y=c(0.20, 0.10, 0.49, 0.26, 0.92, 0.95, 0.18, 0.19, 0.44, 0.79, 0.61, 0.41, 0.49, 0.34, 0.62, 0.99, 0.38, 0.22,0.71,0.40),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56),x3=c(24.5,24,23,22.5,22,24,25,26,25,24,23,21,26,25.5,24,23,24,24,24,22))

# simulate group sizes n_j and event counts d_j with 1 <= d_j <= n_j - 1
n_vec=rpois(nrow(data),sample(10:30,1))

d_vec=c()

for(j in 1:length(n_vec)){

d_vec=c(d_vec,sample(1:(n_vec[j]-1),1))  

}

X=as.matrix(cbind(rep(1,nrow(data)),data[,colnames(data) %in% c("x1","x2")]))

pis=data$y

alpha=rep(0.01,(ncol(X)))

ite=10

alpha_beta_data=array(0,dim=c(ite,ncol(X)))  # parameter trace: one row per iteration

for(j in 1:ite){

V=array(0,dim=c(nrow(data),nrow(data)))

diag(V)=(n_vec*(1/(1+exp(-(X%*%alpha))))*(1-1/(1+exp(-(X%*%alpha)))))  # V = diag(n_j p_j (1 - p_j))

print(sum(V))



mat=solve(t(X)%*%V%*%(X))%*%t(X)%*%(d_vec-n_vec*(1/(1+exp(-(X%*%alpha)))))  # Newton step: (X'VX)^(-1) X'(d - n p)

alpha=alpha+mat

alpha_beta_data[j,]=alpha

}

data=data %>% dplyr::mutate(d=d_vec,predict=n_vec*(1/(1+exp(-(X%*%alpha)))))
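
# Hedged cross-check (not from the book): the same grouped binomial model fit
# with glm(); its coefficients should be close to alpha after the Newton
# iterations. fit_chk is my name.
fit_chk=glm(cbind(d_vec,n_vec-d_vec)~x1+x2,family=binomial,data=data)
cbind(glm=coef(fit_chk),newton=c(alpha))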


# solution 2 p.193: one weighted least-squares pass on the working response z (IRLS form)

V=array(0,dim=c(nrow(data),nrow(data)))

diag(V)=(n_vec*(1/(1+exp(-(X%*%alpha))))*(1-1/(1+exp(-(X%*%alpha)))))

q_vec=n_vec*(1/(1+exp(-(X%*%alpha))))

z=X%*%alpha+solve(V)%*%(d_vec-q_vec)  # working response

beta=solve(t(X)%*%V%*%X)%*%t(X)%*%V%*%z

X2=sum((d_vec-q_vec)^2/diag(V))  # Pearson chi-square goodness-of-fit statistic
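
# Hedged check (my addition): beta is alpha plus one more Newton step, so at
# convergence the two estimates should agree.
max(abs(beta-alpha))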


# stratified analysis (層別解析) p.204: logistic model with stratum intercepts, fit by finite-difference gradient ascent

library(dummies)  # if dummies is unavailable on CRAN, model.matrix(~factor(e)-1) is a base-R alternative

data=data.frame(num=1:20,y=sample(0:1,20,replace=T,prob=c(0.3,0.7)),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56),x3=c(24.5,24,23,22.5,22,24,25,26,25,24,23,21,26,25.5,24,23,24,24,24,22),e=c(rep(1,10),rep(2,5),rep(3,5)))

E=dummy(data$e)

X=as.matrix(data[,colnames(data) %in% c("x1","x2","x3")])

X=t((t(X)-apply(X,2,mean))/apply(X,2,sd))  # standardize each column

Y=data$y

beta=rep(1,ncol(X));r=rep(1,ncol(E))

ite=300000;eta=0.00001

loglik_vec=c()

for(l in 1:ite){

# finite-difference approximation to the partial derivative of the log-likelihood
for(i in 1:length(beta)){

beta_sub=beta;

beta_sub[i]=beta_sub[i]+0.01  

loglik1=as.numeric(beta_sub%*%t(X)%*%Y+r%*%t(E)%*%Y)-sum(log(1+exp(beta_sub%*%t(X)+r%*%t(E))))

loglik2=as.numeric(beta%*%t(X)%*%Y+r%*%t(E)%*%Y)-sum(log(1+exp(beta%*%t(X)+r%*%t(E))))

beta[i]=beta[i]+eta*(loglik1-loglik2)/0.01  

}  

for(i in 1:length(r)){

r_sub=r;

r_sub[i]=r_sub[i]+0.01  

loglik1=as.numeric(beta%*%t(X)%*%Y+r_sub%*%t(E)%*%Y)-sum(log(1+exp(beta%*%t(X)+r_sub%*%t(E))))

loglik2=as.numeric(beta%*%t(X)%*%Y+r%*%t(E)%*%Y)-sum(log(1+exp(beta%*%t(X)+r%*%t(E))))

r[i]=r[i]+eta*(loglik1-loglik2)/0.01  

}  

log_lik=as.numeric(beta%*%t(X)%*%Y+r%*%t(E)%*%Y)-sum(log(1+exp(beta%*%t(X)+r%*%t(E))))  

#print(log_lik)

loglik_vec=c(loglik_vec,log_lik)

}
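
# Hedged cross-check (not from the book): the same model
# logit(p) = X%*%beta + E%*%r can be fit directly with glm(); the slow
# finite-difference ascent above should approach these estimates. fit_chk is my name.
plot(loglik_vec,type="l")  # log-likelihood trace; should increase and level off
fit_chk=glm(Y~X+factor(data$e)-1,family=binomial)
coef(fit_chk)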

# maximizing the conditional likelihood (条件付尤度の最大化): Newton iterations on within-stratum softmax terms


data=data.frame(num=1:20,x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56),x3=c(24.5,24,23,22.5,22,24,25,26,25,24,23,21,26,25.5,24,23,24,24,24,22),e=sample(1:3,20,replace=T))

E=data$e

class=sort(unique(E))

X=as.matrix(data[,colnames(data) %in% c("x1","x2","x3")])/10  # scale down to keep exp() stable

beta=rep(0.01,ncol(X))

eta=10^(-1)

ite=10

for(l in 1:ite){
dbeta=rep(0,length(beta))
ddbeta=array(0,dim=c(length(beta),length(beta)))
loglik=0
prob=c()
for(i in class){
Xs=X[E==i,,drop=FALSE]  # covariate rows for stratum i
ns=nrow(Xs)
w=c(exp(Xs%*%beta))/sum(exp(Xs%*%beta))  # within-stratum softmax weights
values=colSums(Xs*w)  # weighted mean covariate vector (S1/S0)
dbeta=dbeta+colSums(t(t(Xs)-values))  # score: sum_j (x_j - weighted mean)
# Hessian: each of the ns terms contributes (S1/S0)(S1/S0)' - S2/S0
ddbeta=ddbeta+ns*(tcrossprod(values)-t(Xs)%*%(Xs*w))
loglik=loglik+sum(log(w))
prob=c(prob,w)
}
beta=beta+eta*solve(-ddbeta)%*%dbeta  # damped Newton step
print(loglik)
}
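
# Hedged note (not from the book): the likelihood above treats every subject
# in a stratum as an event, so it is not the usual case-control conditional
# likelihood. With a hypothetical 0/1 case indicator column "case", the
# standard stratified conditional logistic fit would be survival::clogit, e.g.
# clogit(case ~ x1 + x2 + x3 + strata(e), data = data)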




# multiclass classification with logistic regression: one-vs-rest binary fits on iris

library(dplyr);library(dummies)

data=data.frame(iris)

data$Species=as.integer(data$Species)

Y=dummy(data$Species)

X=data[,!(colnames(data) %in% c("Species"))]

alpha_box=array(0,dim=c(length(unique(data$Species)),ncol(X)+1))

log_lik=c()

X=as.matrix(cbind(rep(1,nrow(X)),X))

p_value=array(0,dim=c(nrow(data),ncol(Y)))

ite=20

# fit one binary logistic model per class (note: setosa is linearly separable
# in iris, so the Newton iterations for class 1 can drift toward divergence)
for(i in 1:length(unique(data$Species))){

alpha=rep(0.01,ncol(X));pis=Y[,i]

for(j in 1:ite){

p=1/(1+exp(-X%*%alpha))

V=array(0,dim=c(nrow(data),nrow(data)))

diag(V)=p*(1-p)

print(sum(V))

mat=solve(t(X)%*%V%*%(X))%*%t(X)%*%(pis-p)

alpha=alpha+mat


}

p_vec=p;inv_p_vec=1-p_vec

p_vec[p_vec==0]=(10^(-20));inv_p_vec[inv_p_vec==0]=(10^(-20))

log_lik=c(log_lik,sum(pis*log(p_vec)+(1-pis)*log(inv_p_vec)))  # per-class Bernoulli log-likelihood

alpha_box[i,]=alpha

p_value[,i]=p

}

colnames(p_value)=c("p1","p2","p3")

data=data.frame(data,p_value)
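
# Hedged cross-check (not from the book): each one-vs-rest fit can be compared
# with glm(); class 2 (versicolor) is used because the separable class 1 makes
# both estimates unstable. fit2 is my name.
fit2=glm(Y[,2]~Sepal.Length+Sepal.Width+Petal.Length+Petal.Width,
         family=binomial,data=iris)
cbind(glm=coef(fit2),newton=alpha_box[2,])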


# Mantel-Haenszel test p.143: pooled analysis of 2x2 tables across 4 strata

option=1  # selects the summary measure and the variance weight w below

pthi_up=c();pthi_und=c()

a=c(30,40,20,10)

b=c(40,20,10,34)

c=c(30,20,10,15)

d=c(20,22,34,56)

up=c();und=c()

for(j in 1:4){

mat=array(0,dim=c(2,2))

mat[1,1]=a[j];mat[1,2]=b[j];mat[2,1]=c[j];mat[2,2]=d[j]

N=apply(mat,2,sum);M=apply(mat,1,sum)

t=sum(mat)  # stratum total; must be computed before it is used below

if(option==1){

# Mantel-Haenszel common odds ratio pieces: sum(a*d/t) / sum(b*c/t)
pthi_up=c(pthi_up,(a[j]*d[j]/t))

pthi_und=c(pthi_und,(b[j]*c[j]/t))

}else{

# risk-ratio style pieces
pthi_up=c(pthi_up,a[j]*M[2]/t)

pthi_und=c(pthi_und,c[j]*M[1]/t)

}

w=ifelse(option==1,1/(t-1),ifelse(option==2,1/t,1/N[2]))

up=c(up,a[j]-M[1]*N[1]/t)  # observed minus expected count for cell (1,1)

und=c(und,w*prod(N)*prod(M)/(t^2))  # variance contribution

}

kai=sum(up)^2/sum(und)  # Mantel-Haenszel chi-square statistic (1 df)

qchisq(1-0.01,df=1)  # reference critical value for the chi-square statistic

pthi=sum(pthi_up)/sum(pthi_und)  # Mantel-Haenszel common odds ratio estimate

# test-based confidence interval (Miettinen): pthi^(1 +/- z/chi) with chi = sqrt(kai)

# lower limit
pthi^(1-qnorm(1-0.01/2)/sqrt(kai))

# upper limit
pthi^(1+qnorm(1-0.01/2)/sqrt(kai))
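
# Hedged cross-check (not from the book): base R's mantelhaen.test() computes
# the same chi-square and common odds ratio from a 2x2x4 array. arr is my name.
arr=array(0,dim=c(2,2,4))
for(j in 1:4) arr[,,j]=matrix(c(a[j],c[j],b[j],d[j]),2,2)
mantelhaen.test(arr,correct=FALSE)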




# Bernoulli/binomial logistic regression p.181, damped Newton (Fisher scoring)

# multiple covariates

library(dplyr)

data=data.frame(num=1:20,y=c(0.20, 0.10, 0.49, 0.26, 0.92, 0.95, 0.18, 0.19, 0.44, 0.79, 0.61, 0.41, 0.49, 0.34, 0.62, 0.99, 0.38, 0.22,0.71,0.40),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56),x3=c(24.5,24,23,22.5,22,24,25,26,25,24,23,21,26,25.5,24,23,24,24,24,22),sign=0)

data$sign=ifelse(data$y>0.49,1,0)  # dichotomize y at 0.49

X=as.matrix(cbind(rep(1,nrow(data)),data[,colnames(data) %in% c("x1","x2")]))

pis=as.numeric(data$sign)

# simulate group sizes n_j and event counts 1 <= d_j <= n_j
n=sample(c(10:20),nrow(data),replace=T)

d=c()

for(j in 1:length(n)){

d=c(d,sample(c(1:n[j]),1))

}

alpha=rep(0.01,(ncol(X)))

ite=10000

eta=0.01

alpha_beta_data=array(0,dim=c(ite,ncol(X)))

# damped Newton iterations (step size eta)
for(j in 1:ite){

p=(1/(1+exp(-(X%*%alpha))))

V=diag(c(n*p*(1-p)))

mat=solve(t(X)%*%V%*%(X))%*%t(X)%*%(d-n*p)

alpha=alpha+eta*mat

alpha_beta_data[j,]=alpha

log_lik=sum(d*log(p))+sum((n-d)*log(1-p))

print(log_lik)

}

data=data %>% dplyr::mutate(predict=1/(1+exp(-(X%*%alpha))))

# Pearson chi-square statistic

# H0: the logistic regression model fits the data

X2=sum((d-n*p)^2/(n*p*(1-p)))

qchisq(1-0.01,df=nrow(X)-ncol(X))  # df = groups - parameters; ncol(X) already includes the intercept column
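
# Hedged cross-check (not from the book): glm() reproduces the fit, and the
# Pearson statistic is the sum of squared Pearson residuals. fit_chk is my name.
fit_chk=glm(cbind(d,n-d)~x1+x2,family=binomial,data=data)
sum(residuals(fit_chk,type="pearson")^2)  # compare with X2 above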