Spark機械学習の2つのパッケージ方式
954 ワード
1.MLlib
2.ML
# MLlib (the RDD-based API) is trained on LabeledPoint records,
# so the input RDD is first converted: rdd -> LabeledPoint.
# LabeledPoint(y, features):
#   y        - the label, stored as a Double
#   features - a Spark vector, built here with Vectors.dense
# LabeledPoint and Vectors must be imported before use.
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Each record x is assumed to hold the label in x[0] and the feature
# values in x[1:] -- confirm against how RDD is actually built.
labeledpoint = RDD.map(lambda x: LabeledPoint(x[0], Vectors.dense(x[1:])))
# Train a model on the LabeledPoint RDD, then predict from feature vectors.
# NOTE(review): the original snippet omitted the algorithm in front of
# `.train`; LogisticRegressionWithLBFGS is used here only as an example.
# Substitute whichever pyspark.mllib algorithm is intended.
from pyspark.mllib.classification import LogisticRegressionWithLBFGS

model = LogisticRegressionWithLBFGS.train(labeledpoint)
# predict() is given the features only (no labels), as the argument name suggests.
model.predict(test_labeledpoint_features)
2.ML
# The ML package works on DataFrames: the input DataFrame is transformed
# into the dataset used for training (dataframe -> dataset).
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
# lst_col is a list defined elsewhere -- presumably the names of the
# feature columns; it is passed as the assembler's input columns.

# Stage 1: merge the input columns into a single "features" vector column.
vecAssembler = VectorAssembler(
    inputCols=lst_col,
    outputCol="features",
)

# Stage 2: index the "y" column into a "label" column.
stringIndexer = StringIndexer(
    inputCol="y",
    outputCol="label",
)

# Chain both stages, fit them on `data`, then apply the fitted pipeline to
# the same `data` to obtain the dataset with "features" and "label" columns.
pipeline = Pipeline(stages=[vecAssembler, stringIndexer])
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
# Fit an estimator on the training split, then apply the fitted model to
# the test split.
# NOTE(review): the original snippet omitted the estimator in front of
# `.fit` and then called an unassigned `lrModel`; LogisticRegression is
# assumed from that name -- substitute the intended pyspark.ml estimator.
# trainingData / testData are not defined in this snippet; presumably they
# are splits of `dataset` (e.g. via dataset.randomSplit) -- confirm.
from pyspark.ml.classification import LogisticRegression

# "features" / "label" match the column names produced by the pipeline.
model = LogisticRegression(featuresCol="features", labelCol="label").fit(trainingData)
model.transform(testData)