李宏毅-機械学習HW 2

1130 ワード

二分類問題ですが、ここでは logistic regression を使用しています。
# Load the spam training set. The path is machine-specific.
# The context manager closes the file handle (the bare open() never did), and
# newline='' is what the csv module documents for files handed to csv.reader.
with open(r'C:\Users\Lenovo\Desktop\HW2\spam_train.csv', newline='') as f:
    data = list(csv.reader(f))

# Every cell is parsed as a float, so a header row would raise ValueError here.
Data = np.array([list(map(float, row)) for row in data])

# Drop the first column (presumably a row id -- confirm against the CSV).
Data = Data[:, 1:]

# The last remaining column is the target; everything before it is a feature.
# The slices are already float arrays of the right rank, so no reshape is
# needed and the feature count is taken from the file instead of being
# hard-coded.
X = Data[:, :-1]
y = Data[:, -1]

# Append a constant column of ones so the final weight acts as the bias term.
X = np.hstack([X, np.ones((len(X), 1))])
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the split is random on every run
# because no random_state is passed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
)

def logistic_regression(X, y, W, iters, lr, eps=1e-8):
    """Fit logistic-regression weights with Adagrad-scaled gradient descent.

    Args:
        X: 2-D array, one sample per row and one feature per column.
        y: 1-D array of targets, one per row of ``X``.
        W: Initial weight vector (NumPy array), one entry per column of
            ``X``. It is not modified in place; a new array is returned.
        iters: Number of full-batch update steps to run.
        lr: Base learning rate.
        eps: Small constant added to the Adagrad denominator so that a
            gradient component that has been exactly zero so far (for
            example an all-zero feature column) yields a zero step instead
            of 0/0 = NaN.

    Returns:
        The weight vector after ``iters`` updates.
    """
    grad_sq_sum = np.zeros(W.shape)
    n_samples = len(X)
    for _ in range(iters):
        # tanh form of the sigmoid: equal to 1 / (1 + exp(-s)) but it cannot
        # overflow for strongly negative scores.
        prob = 0.5 * (1.0 + np.tanh(0.5 * X.dot(W)))
        # Mean gradient of the cross-entropy loss with respect to W.
        d = X.T.dot(prob - y) / n_samples
        # Adagrad: scale each coordinate by the root of its accumulated
        # squared gradients.
        grad_sq_sum = grad_sq_sum + d ** 2
        W = W - lr * d / (np.sqrt(grad_sq_sum) + eps)
    return W
# Start from an all-zero weight vector with one weight per column of X_train,
# then run 20000 update steps at a base learning rate of 0.9.
w = np.zeros(X_train.shape[1])
W = logistic_regression(X_train, y_train, w, iters=20000, lr=0.9)

def test(W,X):
    num=len(X)
    y_pre=[]
    for i in range(num):
        s=np.sum(X[i]*W)
        s=1.0/(1.0+np.exp(-1*s))
        if s>0.5:
            y_pre.append(1)
        else:
            y_pre.append(0)
    return y_pre
# Predict on the held-out rows and report the fraction classified correctly.
y_pre = test(W, X_test)
# The original left the accuracy as a bare expression, so its value was
# discarded when the file ran as a script; keep it in a name and print it.
accuracy = np.sum(y_pre == y_test) / len(y_test)
print(accuracy)