Main Line 1.2: A Python Implementation of the FM Algorithm

For the theory behind the algorithm, see the blog post Main Line 1.1: A Detailed Explanation of the FM Algorithm's Principles. This article goes straight to the code; all related explanations are in the code comments.
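For reference (the full derivation is in Main Line 1.1), the degree-2 FM model implemented below is

$$\hat{y}(x) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle\, x_i x_j ,$$

and the code evaluates the pairwise term in O(kn) time via the identity

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle v_i, v_j \rangle\, x_i x_j = \frac{1}{2}\sum_{f=1}^{k}\left[\Big(\sum_{i=1}^{n} v_{i,f}\, x_i\Big)^{2} - \sum_{i=1}^{n} v_{i,f}^{2}\, x_i^{2}\right],$$

which is exactly what inter_1, inter_2, and interaction compute in the code.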
# coding:UTF-8

from numpy import *  # mat, zeros, shape, multiply, exp, ...
from random import normalvariate  # draws a sample from a normal distribution
from datetime import datetime

trainData = 'E://data//diabetes_train.txt'
testData = 'E://data//diabetes_test.txt'
featureNum = 8  # number of features per sample in the diabetes data


def loadDataSet(data):  # load samples and labels from a text file
    dataMat = []
    labelMat = []

    with open(data) as fr:  # the file is closed automatically when the block exits
        for line in fr:  # iterate over the file line by line until EOF
            currLine = line.strip().split()  # strip the newline, split on whitespace
            lineArr = []

            for i in range(featureNum):
                lineArr.append(float(currLine[i + 1]))  # columns 1..featureNum are the features
            dataMat.append(lineArr)  # collect the feature rows

            labelMat.append(float(currLine[0]) * 2 - 1)  # map the 0/1 label to -1/+1
    return dataMat, labelMat
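
# The expected input format, as read by loadDataSet above: one sample per
# line, the 0/1 label first, followed by featureNum whitespace-separated
# feature values. A hypothetical example line (made-up numbers):
#   1 6.0 148.0 72.0 35.0 0.0 33.6 0.627 50.0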


def sigmoid(inx):  # the logistic sigmoid function
    return 1.0 / (1 + exp(-inx))
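
# Note: with the star import above, exp here is numpy's exp, so a very
# negative inx overflows to inf with only a RuntimeWarning and sigmoid
# returns 0.0; math.exp would raise OverflowError instead.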


def stocGradAscent(dataMatrix, classLabels, k, iter):  # train FM by stochastic gradient descent
    # dataMatrix is a numpy mat (m x n); classLabels is a list of -1/+1 labels;
    # k is the dimension of the latent factor vectors; iter is the number of epochs
    m, n = shape(dataMatrix)
    alpha = 0.01  # learning rate
    # initialize the parameters
    w = zeros((n, 1))  # one linear weight per feature
    w_0 = 0.  # global bias
    # latent factor matrix v (n x k), each entry drawn independently from a
    # normal distribution; the original normalvariate(0, 0.2) * ones((n, k))
    # drew a single value and broadcast it, leaving every row of v identical
    v = mat([[normalvariate(0, 0.2) for _ in range(k)] for _ in range(n)])

    for it in range(iter):
        print(it)  # current epoch
        for x in range(m):  # stochastic: update the parameters one sample at a time
            inter_1 = dataMatrix[x] * v  # 1 x k row: sum_i v_{i,f} * x_i for each factor f
            inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
            # pairwise interaction term, computed in O(kn)
            interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.

            p = w_0 + dataMatrix[x] * w + interaction  # the FM model output

            loss = sigmoid(classLabels[x] * p[0, 0]) - 1  # gradient factor of the logit loss
            print(loss)

            w_0 = w_0 - alpha * loss * classLabels[x]  # update the global bias

            for i in range(n):
                if dataMatrix[x, i] != 0:  # gradients are zero for zero-valued features
                    w[i, 0] = w[i, 0] - alpha * loss * classLabels[x] * dataMatrix[x, i]
                    for j in range(k):
                        v[i, j] = v[i, j] - alpha * loss * classLabels[x] * (
                                    dataMatrix[x, i] * inter_1[0, j] - v[i, j] * dataMatrix[x, i] * dataMatrix[x, i])

    return w_0, w, v
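
# For reference, the updates above follow the FM partial derivatives
#   d y_hat / d w_0     = 1
#   d y_hat / d w_i     = x_i
#   d y_hat / d v_{i,f} = x_i * (sum_j v_{j,f} x_j) - v_{i,f} * x_i^2
# each scaled by the logit-loss factor loss * classLabels[x] and the
# learning rate alpha.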


def getAccuracy(dataMatrix, classLabels, w_0, w, v):  # returns the error rate (the callers print 1 - error)
    m, n = shape(dataMatrix)
    allItem = 0
    error = 0
    result = []
    for x in range(m):
        allItem += 1
        inter_1 = dataMatrix[x] * v
        inter_2 = multiply(dataMatrix[x], dataMatrix[x]) * multiply(v, v)  # multiply is element-wise
        # pairwise interaction term, same computation as in training
        interaction = sum(multiply(inter_1, inter_1) - inter_2) / 2.
        p = w_0 + dataMatrix[x] * w + interaction  # the FM model output

        pre = sigmoid(p[0, 0])  # predicted probability of the positive class

        result.append(pre)

        # threshold at 0.5 and count the misclassified samples
        if pre < 0.5 and classLabels[x] == 1.0:
            error += 1
        elif pre >= 0.5 and classLabels[x] == -1.0:
            error += 1
        else:
            continue

    print(result)

    return float(error) / allItem


if __name__ == '__main__':
    dataTrain, labelTrain = loadDataSet(trainData)
    dataTest, labelTest = loadDataSet(testData)
    date_startTrain = datetime.now()  # record the training start time
    print("start training")
    w_0, w, v = stocGradAscent(mat(dataTrain), labelTrain, 20, 200)
    print("training accuracy: %f" % (1 - getAccuracy(mat(dataTrain), labelTrain, w_0, w, v)))
    date_endTrain = datetime.now()
    print("training time: %s" % (date_endTrain - date_startTrain))
    print("start testing")
    print("test accuracy: %f" % (1 - getAccuracy(mat(dataTest), labelTest, w_0, w, v)))