環境と健康データ ーリスク評価のデータサイエンスー 柳川先生



#p.91 5.2.3 ロジスティック回帰

用量=c(0.8,1.4,3.3,3.9,7.5,11.2,11.8,14.5,31)

death=c(1,2,0,1,10,47,46,60,60)

n=rep(60,9)

survive=n-death

data=data.frame(用量,death,survive,n)

Y=death

X=as.matrix(cbind(rep(1,length(用量)),用量))

#ロジスティック回帰(二項分布~ロジット変換) フィッシャースコア法

X=as.matrix(X)

alpha=rep(0.01,(ncol(X)))

ite=10000

for(l in 1:ite){

p=1/(1+exp(-X%*%alpha))

V=array(0,dim=c(nrow(X),nrow(X)))

diag(V)=(n*p*(1-p))

mat=solve(t(X)%*%V%*%(X))%*%t(X)%*%(Y-n*p)

alpha=alpha+mat

}

res=data.frame(predict=n*p,Y=Y)

V_alpha=solve(t(X)%*%V%*%(X))


pd=p

#図5.4

plot(用量,pd,xlab="用量",ylab="pd",type="o")

#50%の動物を死亡させる用量

LD50_hat=-c(alpha)[1]/c(alpha)[2]

sigma_hat=V_alpha[1,1]/(alpha[1]^2)+V_alpha[2,2]/(alpha[2]^2)+(-2)*V_alpha[1,2]/prod(alpha)

#log(LD50_hat)の両側5%の信頼区間

und=log(LD50_hat)-qnorm(1-0.05/2)*sqrt(sigma_hat)

up=log(LD50_hat)+qnorm(1-0.05/2)*sqrt(sigma_hat)




#p.136 コクランアーミテージ

d=c(0,0.05,0.1,0.25,0.5)

n=c(196,127,130,119,113)

y=c(237,218,282,269,426)

d_mat=cbind(rep(1,length(d)),d)

#poisson reg 最尤推定量

b=rep(1,2)

ite=10^(3)

eta=0.001

for(l in 1:ite){

db0=sum(y)-sum(n*exp(d_mat%*%b))  

db1=sum(y*d)-sum(d*n*exp(d_mat%*%b))

db=c(db0,db1)

b=b+eta*db

loglik=sum((d_mat*c(y))%*%b-n*exp(d_mat%*%b))

print(loglik)

}

d_ave=sum(n*d)/sum(n)

y_plus=sum(y)/sum(n)

ZCA=sum((d-d_ave)*y)/sqrt(y_plus*sum(n*(d-d_ave)^2))

qnorm(1-0.05/2)


#キノリンデータ p.140

d=c(0,10,33,100,333)

d_ave=mean(d)

yij=matrix(c(15,21,29,16,18,21,16,26,33,27,41,60,33,38,41),ncol=5)

y=apply(yij,2,sum);n=rep(3,ncol(yij))

d_mat=cbind(rep(1,length(d)),d)

#poisson reg 最尤推定量

b=rep(0.01,2)

ite=10^(6)

eta=10^(-7)

for(l in 1:ite){

db0=sum(y)-sum(n*exp(d_mat%*%b))  

db1=sum(y*d)-sum(d*n*exp(d_mat%*%b))

db=c(db0,db1)

b=b+eta*db

loglik=sum((d_mat*c(y))%*%b-n*exp(d_mat%*%b))-sum(log(factorial(yij)))

#print(loglik)

}

2*sum(y*log(y/exp(d_mat%*%b))-(y-exp(d_mat%*%b)))

d_ave=sum(n*d)/sum(n)

y_plus=sum(y)/sum(n)

ZCA=sum((d-d_ave)*y)/sqrt(y_plus*sum(n*(d-d_ave)^2))

qnorm(1-0.05/2)

#超過変動の検定(ポアソン回帰による変動より大きいかどうか)

kai2=sum((t(yij)-apply(yij,2,mean))^2/apply(yij,2,mean))

v=sum(n-1)

#kai2がqchisqよりおおきければ、ポアソン変動に従わない

kai2>qchisq(1-0.05,df=v)


#キノリンデータ p.140

#負の二項回帰モデル

d=c(0,10,33,100,333)

d_ave=mean(d)

yij=matrix(c(15,21,29,16,18,21,16,26,33,27,41,60,33,38,41),ncol=5)

y=apply(yij,2,sum);n=rep(3,ncol(yij))

d_mat=cbind(rep(1,length(d)),d)

# 最尤推定量

b=rep(1,2)

r=30

ite=10^(6)

eta=10^(-6)

eta_r=10^(-4)

for(l in 1:ite){

b_pre=b+1

while(sum(abs(b_pre-b))>10^(-5))({

b_pre=b

db=-t(d_mat)%*%((y+r)*(exp(d_mat%*%b)/(exp(d_mat%*%b)+r)))+t(d_mat)%*%y

b=b+eta*db

loglik=-sum((y+r)*log(exp(d_mat%*%b)+r))+sum(y*d_mat%*%b)+r*log(r)+sum(log(gamma(y+r)))-log(gamma(r))-sum(log(factorial(y)))

print(loglik)

})

r_pre=r+1

while(sum(abs(r_pre-r))>10^(-5))({

r_pre=r

dr=sum(-log(exp(d_mat%*%b)+r)-(y+r)/(exp(d_mat%*%b)+r))+log(r)+1+sum(digamma(y+r))-digamma(r)

r=r+eta*dr

loglik=-sum((y+r)*log(exp(d_mat%*%b)+r))+sum(y*d_mat%*%b)+r*log(r)+sum(log(gamma(y+r)))-log(gamma(r))-sum(log(factorial(y)))

print(loglik)

})

loglik=-sum((y+r)*log(exp(d_mat%*%b)+r))+sum(y*d_mat%*%b)+r*log(r)+sum(log(gamma(y+r)))-log(gamma(r))-sum(log(factorial(y)))

print(loglik)

}


mu=exp(d_mat%*%b)

2*sum((y+r)*log((mu+r)/(y+r))+y*log(y/mu))

w=(mu+r)/(mu*r)

#回帰変数の分散共分散行列
solve(t(d_mat)%*%solve(diag(c(w)))%*%d_mat)




#p.176 タロンの傾向性検定

data1=data.frame(t=4,対照群=c(0,5),低用量群=c(0,5),高用量群=c(1,4))

data2=data.frame(t=6,対照群=c(0,5),低用量群=c(1,3),高用量群=c(0,3))

data3=data.frame(t=7,対照群=c(0,5),低用量群=c(0,3),高用量群=c(2,1))

data4=data.frame(t=8,対照群=c(1,4),低用量群=c(1,2),高用量群=c(0,1))

n=rbind(apply(data1,2,sum),apply(data2,2,sum),apply(data3,2,sum),apply(data4,2,sum))

n=as.matrix(n[,-1])

d_plus=c(apply(data1[,-1],1,sum)[1],apply(data2[,-1],1,sum)[1],apply(data3[,-1],1,sum)[1],apply(data4[,-1],1,sum)[1])

n_plus=c(sum(data1[,-1]),sum(data2[,-1]),sum(data3[,-1]),sum(data4[,-1]))

d=as.matrix(rbind(data1[1,-1],data2[1,-1],data3[1,-1],data4[1,-1]))

#勾配上昇法

#用量
C=c(1:ncol(d))/1000

#生存の傾向
beta=0.01

ite=10^6

eta=10^(-5)

loglik_vec=c()

for(l in 1:ite){

dbeta=sum(d%*%C)-sum(d_plus*n%*%c(C*exp(beta*C))/(n%*%c(exp(beta*C))))

dC=apply(d,2,sum)-c((t(n)*c(beta*exp(beta*C)))%*%c(d_plus/n%*%c(exp(beta*C))))

beta=beta+eta*dbeta

C=C+eta*dC

loglik=sum(beta*d%*%C)-sum(d_plus*log(n%*%c(exp(beta*C))))

print(loglik)

loglik_vec=c(loglik_vec,loglik)

}

plot(loglik_vec)


C_hat=C


C=c(0,1,5)

#平均

E_dij=n*c(d_plus/n_plus)

VT=sum((d_plus*(n_plus-d_plus)/(n_plus-1))*(apply(t(t(n)*C^2)/n_plus,1,sum)-apply(t(t(n)*C)/n_plus,1,sum)^2))

ZTA=sum((d-E_dij)%*%C)/sqrt(VT)