import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
//Data format: label,feature1 feature2 feature3 (comma separates label from space-separated features)
//0,1 0 1
//1,0 2 0
object tmp_naive_bayes {

  /** Trains a multinomial Naive Bayes classifier from a local text file and
    * reports its accuracy on a held-out split, then saves and reloads the model.
    *
    * Expected input line format: "label,feature1 feature2 ...", e.g. "0,1 0 1".
    *
    * Optional arguments (defaults preserve the original hard-coded paths):
    *   args(0) — input data file path
    *   args(1) — directory to save the trained model to
    *     (NOTE(review): MLlib `save` fails if this directory already exists — TODO confirm desired overwrite behavior)
    */
  def main(args: Array[String]): Unit = {
    val inputPath =
      if (args.length > 0) args(0)
      else "C://Users/wpguoc/Desktop/Spark MLlib/navie_bayes_data.txt"
    val modelPath =
      if (args.length > 1) args(1)
      else "C://Users/wpguoc/Desktop/Spark_MLlib/"

    // 1. Build the Spark context (local mode) and quiet the default INFO logging.
    val conf = new SparkConf().setAppName("naive_bayes").setMaster("local")
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)
    try {
      // 2. Load and parse samples: "label,f1 f2 f3" -> LabeledPoint(label, dense features).
      val data = sc.textFile(inputPath)
      val parsedData = data.map { line =>
        val parts = line.split(',')
        LabeledPoint(parts(0).toDouble,
          Vectors.dense(parts(1).split(' ').map(_.toDouble)))
      }

      // 3. Split into 60% training / 40% test; fixed seed keeps the split reproducible.
      val Array(training, test) = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)

      // 4. Train a multinomial Naive Bayes model (lambda = additive smoothing).
      val model = NaiveBayes.train(training, lambda = 1.0, modelType = "multinomial")

      // 5. Evaluate on the test set: print up to 20 (prediction, label) pairs.
      val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
      println("贝叶斯分类结果:" + "\n" + "prediction" + "\t" + "label")
      predictionAndLabel.take(20).foreach { case (prediction, label) =>
        println(prediction + "\t\t\t" + label)
      }
      // Accuracy = fraction of test points whose prediction equals the true label.
      val accuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / test.count()
      println("贝叶斯分类精度" + "\n" + "accuracy: Double = " + accuracy)

      // 6. Persist the model, then reload it and sanity-check the round trip
      //    (the original loaded the model into an unused val, verifying nothing).
      model.save(sc, modelPath)
      val sameModel = NaiveBayesModel.load(sc, modelPath)
      require(sameModel.labels.sameElements(model.labels),
        "reloaded model does not match the saved model")
    } finally {
      sc.stop() // release Spark resources even when a stage above throws
    }
  }
}
运行过程及结果 (Run log and results):
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
18/12/19 17:32:04 INFO SparkContext: Running Spark version 1.6.3
18/12/19 17:32:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
18/12/19 17:32:05 INFO SecurityManager: Changing view acls to: wpguoc
18/12/19 17:32:05 INFO SecurityManager: Changing modify acls to: wpguoc
18/12/19 17:32:05 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(wpguoc); users with modify permissions: Set(wpguoc)
18/12/19 17:32:05 INFO Utils: Successfully started service 'sparkDriver' on port 63424.
18/12/19 17:32:06 INFO Slf4jLogger: Slf4jLogger started
18/12/19 17:32:06 INFO Remoting: Starting remoting
18/12/19 17:32:06 INFO Remoting: Remoting started; listening on addresses :[akka.tcp://sparkDriverActorSystem@192.168.66.80:63437]
18/12/19 17:32:06 INFO Utils: Successfully started service 'sparkDriverActorSystem' on port 63437.
18/12/19 17:32:06 INFO SparkEnv: Registering MapOutputTracker
18/12/19 17:32:06 INFO SparkEnv: Registering BlockManagerMaster
18/12/19 17:32:06 INFO DiskBlockManager: Created local directory at C:\Users\wpguoc\AppData\Local\Temp\blockmgr-4d798e34-90b0-4ee9-a811-586a893f4818
18/12/19 17:32:06 INFO MemoryStore: MemoryStore started with capacity 1127.3 MB
18/12/19 17:32:06 INFO SparkEnv: Registering OutputCommitCoordinator
18/12/19 17:32:06 INFO Utils: Successfully started service 'SparkUI' on port 4040.
18/12/19 17:32:06 INFO SparkUI: Started SparkUI at http://192.168.66.80:4040
18/12/19 17:32:06 INFO Executor: Starting executor ID driver on host localhost
18/12/19 17:32:06 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 63444.
18/12/19 17:32:06 INFO NettyBlockTransferService: Server created on 63444
18/12/19 17:32:06 INFO BlockManagerMaster: Trying to register BlockManager
18/12/19 17:32:06 INFO BlockManagerMasterEndpoint: Registering block manager localhost:63444 with 1127.3 MB RAM, BlockManagerId(driver, localhost, 63444)
18/12/19 17:32:06 INFO BlockManagerMaster: Registered BlockManager
18/12/19 17:32:10 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
18/12/19 17:32:10 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
贝叶斯分类结果:
prediction label
0.0 0.0
0.0 0.0
2.0 2.0
2.0 2.0
2.0 2.0
贝叶斯分类精度
accuracy: Double = 1.0
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
18/12/19 17:32:14 WARN ParquetRecordReader: Can not initialize counter due to context is not a instance of TaskInputOutputContext, but is org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl
Process finished with exit code 0