# Numpy/Pandas: replacing missing data values with the column mean


# -*- coding: utf-8 -*-
#-----------------------------------------------------------------------------------------------------------------------
__Author__ = 'assasin'
__DateTime__ = '2020/1/5 15:13'
#-----------------------------------------------------------------------------------------------------------------------

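'''
Missing-value handling notes.
Numpy:  replace NaN entries with the mean of the valid values in a column.
Pandas: build a DataFrame, extend it with empty (NaN) rows via reindex,
        then fill NaN entries with the per-column mean.
'''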

import numpy as np
import pandas as pd
from numpy import *

def loadDataSet(filepath,delim='\t'):
    """Read a delimited text file of numbers into a NumPy matrix.

    Each non-blank line is stripped of surrounding whitespace, split on
    ``delim``, and every field is converted with ``float`` (so the literal
    ``NaN`` is accepted and becomes a NaN entry).

    Args:
        filepath: Path of the text file to read.
        delim: Field separator; defaults to a tab character.

    Returns:
        A ``numpy.matrix`` with one row per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    rows = []
    # The context manager closes the handle even when parsing fails.
    with open(filepath) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank line would split into [''] and crash float('').
                continue
            rows.append([float(field) for field in stripped.split(delim)])
    # `mat` was only an alias of `asmatrix` and was removed in NumPy 2.0.
    return np.asmatrix(rows)


def replaceNanwithMean(dataArr):
    """Replace NaN entries with the mean of the valid values in their column.

    The input is modified in place and also returned. Every column except
    the last one is processed.
    NOTE(review): the original loop bound skips the last column, presumably
    because it holds labels -- confirm with the data set in use.

    Args:
        dataArr: 2-D ``numpy.matrix`` (or ``numpy.ndarray``) of numbers.

    Returns:
        The same object that was passed in, with NaN entries filled.
    """
    numfeat = np.shape(dataArr)
    for i in range(numfeat[1]-1):
        # Flatten to 1-D so matrix and ndarray inputs behave the same way.
        column = np.asarray(dataArr[:, i]).ravel()
        missing = np.isnan(column)
        if not missing.any() or missing.all():
            # Nothing to fill, or no valid value to compute a mean from.
            continue
        meanVal = column[~missing].mean()
        dataArr[np.nonzero(missing)[0], i] = meanVal

    return dataArr





if __name__ == '__main__':
    # --- NumPy approach ---
    # Load the data set; fields in the file are separated by four spaces.
    dataArr = loadDataSet(r'../xxx.txt','    ')

    # Fill NaN entries with column means (result is not printed here).
    replaceNanwithMean(dataArr)

    # --- Pandas approach ---
    # Load the file again so this part starts from the raw values.
    datamat = loadDataSet(r'../xxx.txt','    ')
    df = pd.DataFrame(datamat)
    # Append five extra rows; reindex fills the new rows with NaN.
    df = df.reindex(range(datamat.shape[0] + 5))
    # Replace every NaN with the mean of its column (the mean skips NaN).
    filled = df.fillna(df.mean())
    # np.asmatrix replaces the `mat` alias, which NumPy 2.0 removed.
    print(np.asmatrix(filled.to_numpy()))