Spark機械学習の2つのパッケージ方式

954 ワード

1.MLlib
 
# MLlib (RDD-based API): the training data must be an RDD of LabeledPoint.
# rdd -> LabeledPoint
# LabeledPoint(y, features)
#   y        : the label, a Double
#   features : a Vectors value from Spark's own linalg module
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Wrap every row in a LabeledPoint: the first field is the label and the
# remaining fields become a dense feature vector.
labeledpoint = RDD.map(lambda x: LabeledPoint(x[0], Vectors.dense(x[1:])))

# Choose an algorithm, then train it on the LabeledPoint RDD.
# Logistic regression is used here only as an example; another RDD-based
# MLlib trainer can be substituted.
model = LogisticRegressionWithLBFGS.train(labeledpoint)
# Predict from the feature vectors of the test data.
model.predict(test_labeledpoint_features)

 
2.ML
# ML (DataFrame-based API): the input DataFrame must be converted into a dataset

# dataframe -> dataset

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

# lst_col is the list of feature column names to assemble into one vector.
vecAssembler = VectorAssembler(inputCols=lst_col, outputCol="features")
# Index the raw target column "y" into the "label" column.
stringIndexer = StringIndexer(inputCol="y", outputCol="label")

# Run both feature stages as one pipeline to produce the prepared dataset.
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

# Hold out part of the prepared dataset for testing; the fixed seed keeps
# the split reproducible.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=42)

# Choose an estimator and fit it on the training split.
# Logistic regression is used here only as an example estimator.
model = LogisticRegression(featuresCol="features", labelCol="label").fit(trainingData)
# transform() appends the prediction columns to the test split.
model.transform(testData)