# "スパース性に基づく機械学習" (Sparsity-Based Machine Learning) by Ryota Tomioka, Kodansha



# 6.3 Iteratively reweighted shrinkage method, p.72
# Minimizes ||Y - Xw||^2 + lam*||w||_1 via the variational bound
# |w_j| = min_{eta_j > 0} (w_j^2/eta_j + eta_j)/2: an inner gradient descent on
# a weighted ridge surrogate alternates with the closed-form update eta_j = |w_j|.

data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y

eta=rep(1,ncol(X))   # per-coordinate weights, updated to |w| each outer iteration
mu=10^(-8)           # gradient step size
lam=1                # regularization strength
w=rep(1,ncol(X))
ite=10^5

for(l in 1:ite){
  # Inner loop: gradient descent on the weighted ridge surrogate
  # ||Y - Xw||^2 + (lam/2)*sum(w^2/eta), run until the cost stabilizes.
  cost=100; cost_pre=0
  while(abs(cost-cost_pre)>10^(-1)){
    cost_pre=cost
    w=w-mu*(-2*t(X)%*%(Y-X%*%w)+lam*w/eta)
    cost=sum((Y-X%*%w)^2)+lam*sum(w^2/eta)/2
  }
  # Adding (lam/2)*sum(eta) gives the upper bound on the lasso objective.
  cost2=cost+sum(eta)*lam/2
  # Closed-form minimizer of the bound in eta, floored to avoid division
  # by zero when a coordinate of w reaches exactly 0.
  eta=pmax(abs(w),10^(-10))
  print(cost2)
}
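# A quick numeric check (an added sketch, not from the book's code) of the bound
# used above: |w| = min_{eta>0} (w^2/eta + eta)/2, which is minimized at eta = |w|.
w_test=1.7
eta_grid=seq(0.01,5,by=0.01)
bound=(w_test^2/eta_grid+eta_grid)/2
eta_grid[which.min(bound)]  # approximately abs(w_test) = 1.7
min(bound)                  # approximately abs(w_test) = 1.7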


# Lasso via steepest-descent (proximal gradient) updates: ISTA
# Each step is a gradient step on ||Y - Xw||^2 followed by soft-thresholding.


data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y


mu=10^(-8)   # step size
lam=1
w=rep(1,ncol(X))
ite=10^2
cost_vec=c()

for(l in 1:ite){
  # Gradient step on the squared-error term
  dw=w-mu*(-2*t(X)%*%(Y-X%*%w))
  # Soft-thresholding: the proximal operator of lam*mu*||.||_1
  w=ifelse(abs(dw)<=lam*mu,0,ifelse(dw>lam*mu,dw-lam*mu,dw+lam*mu))
  cost=sum((Y-X%*%w)^2)+lam*sum(abs(w))
  cost_vec=c(cost_vec,cost)
  print(cost)
}

cost_vec_ISTA=cost_vec
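# The ifelse() above implements the soft-thresholding operator. A compact
# standalone version (a hypothetical helper, not in the original script):
soft_thresh=function(z,t) sign(z)*pmax(abs(z)-t,0)
# e.g. soft_thresh(c(-0.3,0.05,0.2),0.1) gives -0.2, 0, 0.1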



# Lasso via steepest-descent (proximal gradient) updates with Nesterov momentum: FISTA
# The gradient/thresholding step is applied at a point extrapolated from the
# previous two proximal iterates.


data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y


mu=10^(-8)
lam=1
w=rep(1,ncol(X))   # extrapolated point at which gradients are evaluated
b=w                # proximal iterate
ite=10^2
eta=1              # momentum sequence; FISTA starts from eta_1 = 1
cost_vec=c()

for(l in 1:ite){
  b_pre=b
  # Gradient step at the extrapolated point, then soft-thresholding
  dw=w-mu*(-2*t(X)%*%(Y-X%*%w))
  b=ifelse(abs(dw)<=lam*mu,0,ifelse(dw>lam*mu,dw-lam*mu,dw+lam*mu))
  # Nesterov momentum update
  eta_pre=eta
  eta=(1+sqrt(1+4*eta^2))/2
  w=b+((eta_pre-1)/eta)*(b-b_pre)
  # Track the cost at the proximal iterate b, the actual estimate
  cost=sum((Y-X%*%b)^2)+lam*sum(abs(b))
  print(cost)
  cost_vec=c(cost_vec,cost)
}

cost_vec_FISTA=cost_vec
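# The momentum coefficients (eta_pre-1)/eta grow toward 1, which is what
# accelerates FISTA over ISTA. A quick look at the first few values
# (an added sketch, not in the original):
eta_chk=1
for(k in 1:5){
  eta_chk_pre=eta_chk
  eta_chk=(1+sqrt(1+4*eta_chk^2))/2
  print((eta_chk_pre-1)/eta_chk)   # 0, then rising toward 1
}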

# Difference in how the cost decreases: ISTA vs. FISTA

yl=range(c(cost_vec_ISTA,cost_vec_FISTA))
plot(1:ite,cost_vec_ISTA,ylim=yl,col=2,xlab="index",ylab="cost")
points(1:ite,cost_vec_FISTA,col=3)
legend("topright",legend=c("ISTA","FISTA"),col=c(2,3),pch=1)






# Lasso regression with a kernel (approximately an SVM-style lasso?)

data=data.frame(num=1:20,y=c(0.20, 0.10, 0.49, 0.26, 0.92, 0.95, 0.18, 0.19, 0.44, 0.79, 0.61, 0.41, 0.49, 0.34, 0.62, 0.99, 0.38, 0.22,0.71,0.40),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56),x3=c(24.5,24,23,22.5,22,24,25,26,25,24,23,21,26,25.5,24,23,24,24,24,22),sign=0)

Y=data$y*100

X=as.matrix(data[,colnames(data) %in% c("x1","x2","x3")])

# Gaussian (RBF) kernel matrix: K[i,j] = exp(-||x_i - x_j||^2 / 2)
kernel=array(0,dim=c(nrow(X),nrow(X)))
for(j in 1:nrow(X)){
  kernel[j,]=exp(-apply((t(X)-c(X[j,]))^2,2,sum)/2)
}
K=kernel   # the Gaussian kernel matrix is symmetric

# Lasso via steepest-descent (proximal gradient) updates: ISTA, with the kernel
# matrix as the design matrix

X=K          # design matrix is now the kernel matrix; w holds per-sample weights
mu=10^(-2)
lam=1
w=rep(1,ncol(X))
ite=10^5
cost_vec=c()

for(l in 1:ite){
  dw=w-mu*(-2*t(X)%*%(Y-X%*%w))
  # Soft-thresholding step, as in the ISTA block above
  w=ifelse(abs(dw)<=lam*mu,0,ifelse(dw>lam*mu,dw-lam*mu,dw+lam*mu))
  cost=sum((Y-X%*%w)^2)+lam*sum(abs(w))
  cost_vec=c(cost_vec,cost)
  print(cost)
}

cost_vec_ISTA=cost_vec
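# Fitted values under the kernel model are K %*% w; a quick look at the fit
# (an added sanity check, not part of the original script):
plot(Y,X%*%w,xlab="observed",ylab="fitted")
abline(0,1)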




# L1-regularized logistic regression (lasso)

data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y;Y=ifelse(Y-mean(Y)>0,1,0)

lam=5

# Penalized objective: negative log-likelihood + lam*||v||_1
# (the ifelse() guards keep log() finite when p hits 0 or 1 numerically)
obs=function(v){
  p=1/(1+exp(-X%*%v))
  return(sum(-Y*log(ifelse(p>0,p,1))-(1-Y)*log(ifelse(1-p>0,1-p,1)))+lam*sum(abs(v)))
}

gamma=1/(max(svd(X)$d^2))   # step size from the largest squared singular value
w=rep(0.01,ncol(X))
ite=10^5

for(l in 1:ite){
  p=1/(1+exp(-X%*%w))
  # Gradient step on the negative log-likelihood
  dw=w-gamma*t(X)%*%(p-Y)
  # Soft-thresholding, applied in one shot so values already shrunk below the
  # threshold are not zeroed out by a later comparison
  thresh=gamma*lam
  w=sign(dw)*pmax(abs(dw)-thresh,0)
  print(obs(w))
}
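# A quick check of the fitted classifier (an added sketch, not in the original):
p_hat=1/(1+exp(-X%*%w))
table(actual=Y,predicted=as.numeric(p_hat>0.5))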



# L2-regularized logistic regression (ridge)

data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y;Y=ifelse(Y-mean(Y)>0,1,0)

lam=1

# Penalized objective: negative log-likelihood + lam*||v||^2
obs=function(v){
  p=1/(1+exp(-X%*%v))
  return(sum(-Y*log(ifelse(p>0,p,1))-(1-Y)*log(ifelse(1-p>0,1-p,1)))+lam*sum(v^2))
}

gamma=1/(max(svd(X)$d^2))
w=rep(0.01,ncol(X))
ite=10^5

for(l in 1:ite){
  p=1/(1+exp(-X%*%w))
  dw=w-gamma*t(X)%*%(p-Y)
  # Proximal step for the ridge penalty: closed-form shrinkage
  w=dw/(1+2*gamma*lam)
  observation=obs(w)
  # print(observation)   # uncomment to monitor the objective
}
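# One-dimensional numeric check (an added sketch) that the shrinkage above is
# the proximal operator of t*u^2: argmin_u (1/2)*(u-z)^2 + t*u^2 = z/(1+2*t).
zz=0.8; tt=0.3
u_grid=seq(-2,2,by=10^(-4))
u_grid[which.min((u_grid-zz)^2/2+tt*u_grid^2)]   # approximately 0.8/1.6 = 0.5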



# Logistic regression with an elastic-net penalty (L1 + L2)

data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y;Y=ifelse(Y-mean(Y)>0,1,0)

lam1=1;lam2=1

# Penalized objective: negative log-likelihood + lam1*||v||_1 + lam2*||v||^2
obs=function(v){
  p=1/(1+exp(-X%*%v))
  return(sum(-Y*log(ifelse(p>0,p,1))-(1-Y)*log(ifelse(1-p>0,1-p,1)))+lam1*sum(abs(v))+lam2*sum(v^2))
}

gamma=1/(max(svd(X)$d^2))
w=rep(0.01,ncol(X))
ite=10^5

for(l in 1:ite){
  p=1/(1+exp(-X%*%w))
  dw=w-gamma*t(X)%*%(p-Y)
  # Elastic-net proximal step: soft-thresholding for the L1 part,
  # then closed-form shrinkage for the L2 part
  dw=sign(dw)*pmax(abs(dw)-gamma*lam1,0)
  w=dw/(1+2*gamma*lam2)
  print(obs(w))
}
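# The two-stage update above equals the elastic-net proximal operator in one
# line (a hypothetical helper, not in the original script):
prox_enet=function(z,t1,t2) (sign(z)*pmax(abs(z)-t1,0))/(1+2*t2)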





# L2-regularized logistic regression (ridge) via Fisher scoring

data=data.frame(num=1:20,y=c(167,167.5,168.4,172,155.3,151.4,163,174,168,160.4,164.7,171,162.6,164.8,163.3,167.6,169.2,168,167.4,172),x1=c(84,87,86,85,82,87,92,94,88,84.9,78,90,88,87,82,84,86,83,85.2,82),x2=c(61,55.5,57,57,50,50,66.5,65,60.5,49.5,49.5,61,59.5,58.4,53.5,54,60,58.8,54,56))

X=as.matrix(data[,colnames(data) %in% c("x1","x2")])

X=cbind(rep(1,nrow(X)),X)

Y=data$y;Y=ifelse(Y-mean(Y)>0,1,0)

n=length(Y);m=ncol(X)
lam=0.01
K=diag(rep(1,m))   # penalty matrix (identity gives a plain ridge penalty)

# Penalized log-likelihood: log-likelihood - (lam*n/2)*v'Kv
obs=function(v){
  p=1/(1+exp(-X%*%v))
  return(sum(Y*log(ifelse(p>0,p,1))+(1-Y)*log(ifelse(1-p>0,1-p,1)))-lam*n*as.numeric(t(v)%*%K%*%v)/2)
}

w=rep(0.01,ncol(X))
ite=10^5   # Fisher scoring typically converges in far fewer iterations

for(l in 1:ite){
  p=1/(1+exp(-X%*%w))
  W=diag(c(p*(1-p)))          # IRLS weight matrix pi*(I - pi)
  z=X%*%w+solve(W)%*%(Y-p)    # working response
  # Each Fisher-scoring step is a weighted ridge regression on z
  w=solve(t(X)%*%W%*%X+n*lam*K)%*%t(X)%*%W%*%z
  print(obs(w))
}
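# Sketch of a tolerance-based stopping rule (an added assumption, not in the
# original; reuses the same X, Y, lam, K, n, and obs as above):
w2=rep(0.01,m); obs_pre=-Inf
for(l in 1:100){
  p=1/(1+exp(-X%*%w2))
  W=diag(c(p*(1-p)))
  z=X%*%w2+solve(W)%*%(Y-p)
  w2=solve(t(X)%*%W%*%X+n*lam*K)%*%t(X)%*%W%*%z
  if(abs(obs(w2)-obs_pre)<10^(-10)) break   # stop once the objective stabilizes
  obs_pre=obs(w2)
}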