Machine learning studies how to make machines simulate human learning behaviour in order to acquire new knowledge and skills and to improve the performance of specific algorithms. It is usually divided into supervised learning, unsupervised learning, semi-supervised learning, and reinforcement learning.

MLlib (the machine learning library) is Spark's implementation of common machine learning algorithms, together with related tests and data generators. It is fast, easy to use, and tightly integrated with the rest of Spark.

The Spark MLlib architecture has three layers: (1) the underlying foundation, including Spark's runtime plus the matrix and vector libraries; (2) the algorithm library, containing generalized linear models, recommendation, clustering, decision trees, and evaluation algorithms; (3) utilities, such as test-data generation and reading external data.

MLlib supports local dense and sparse vectors, as well as labeled points (a local vector paired with a label).

MLlib supports both local matrices and distributed matrices.

Classification algorithms (the listing below first demonstrates MLlib's local vector, labeled point, local matrix, and distributed matrix types, then SVM classification):



package ling

import org.apache.spark.{SparkContext, SparkConf}

object SparkMLlibTest extends App {
  import org.apache.spark.mllib.linalg.{Vector, Vectors}
  val dv: Vector = Vectors.dense(1.0, 0.0, 3.0) // dense vector (1.0, 0.0, 3.0)
  val sv1: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0)) // sparse vector (1.0, 0.0, 3.0) given as size, indices and values
  val sv2: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0))) // the same sparse vector built from (index, value) pairs
  println(dv)
  println(sv1)
  println(sv2)

  import org.apache.spark.mllib.regression.LabeledPoint
  val pos = LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)) // positive example: label 1.0 plus a dense feature vector
  val neg = LabeledPoint(0.0, Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))) // negative example: label 0.0 plus a sparse feature vector
  println(pos)
  println(neg)

  import org.apache.spark.mllib.linalg.{Matrix, Matrices}
  val dm: Matrix = Matrices.dense(3, 2, Array(1.0, 3.0, 5.0, 2.0, 4.0, 6.0)) // 3x2 dense matrix ((1.0, 2.0), (3.0, 4.0), (5.0, 6.0)), values in column-major order
  val sm: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(1, 2, 0), Array(9, 6, 8)) // 3x2 sparse matrix in CSC format: ((0.0, 8.0), (9.0, 0.0), (0.0, 6.0))
  println(dm)
  println(sm)

//  //************ create RowMatrix ****************
//  import org.apache.spark.mllib.linalg.{Vectors, Vector}
//  import org.apache.spark.mllib.linalg.distributed.RowMatrix
//  import org.apache.spark.rdd.RDD
//  val conf = new SparkConf().setMaster("local[2]").setAppName("test")
//  val sc = new SparkContext(conf)
//  val rows: RDD[Vector] = sc.makeRDD(Seq(Vectors.dense(1, 2, 3), Vectors.dense(4, 5, 6))) // an RDD of local vectors
//  val mat: RowMatrix = new RowMatrix(rows) // build a RowMatrix from the RDD[Vector]
//  val m = mat.numRows() // number of rows
//  val n = mat.numCols() // number of columns
//  println(m)
//  println(n)

//  //************ create IndexedRowMatrix ****************
//  import org.apache.spark.mllib.linalg.{Vectors, Vector}
//  import org.apache.spark.rdd.RDD
//  import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix, RowMatrix}
//  val conf = new SparkConf().setMaster("local[2]").setAppName("test")
//  val sc = new SparkContext(conf)
//  val iv1 = new IndexedRow(0, Vectors.dense(1, 2, 3)) // a row together with its Long row index
//  val iv2 = new IndexedRow(1, Vectors.dense(4, 5, 6))
//
//  val rows: RDD[IndexedRow] = sc.makeRDD(Seq(iv1, iv2))
//  val mat: IndexedRowMatrix = new IndexedRowMatrix(rows)
//  val m = mat.numRows()
//  val n = mat.numCols()
//  val rowMat: RowMatrix = mat.toRowMatrix() // drop the indices to get a plain RowMatrix
//  println(m)
//  println(n)


//  //************ create CoordinateMatrix ****************
//  import org.apache.spark.rdd.RDD
//  import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
//  val conf = new SparkConf().setMaster("local[2]").setAppName("test")
//  val sc = new SparkContext(conf)
//  // MatrixEntry wraps a (Long, Long, Double) triple: (row index, column index, value)
//  val e1 = new MatrixEntry(0, 0, 10)
//  val e2 = new MatrixEntry(1, 1, 20)
//  val e3 = new MatrixEntry(2, 2, 30)
//  val e4 = new MatrixEntry(3, 3, 40)
//  val entries: RDD[MatrixEntry] = sc.makeRDD(Seq(e1, e2, e3, e4))
//  val mat: CoordinateMatrix = new CoordinateMatrix(entries)
//  val m = mat.numRows()
//  val n = mat.numCols()
//  println(m)
//  println(n)

//  //************ create CoordinateMatrix from a text file ****************
//  import org.apache.spark.rdd.RDD
//  import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
//  val conf = new SparkConf().setMaster("local[2]").setAppName("test")
//  val sc = new SparkContext(conf)
//  // each line of the input file holds "rowIndex colIndex value"
//  val rdd = sc.textFile("hdfs://cloud01:9000/data/indexedmatrixdata.txt").map(line => line.split(" "))
//    .map(x => new MatrixEntry(x(0).toLong, x(1).toLong, x(2).toDouble))
//
//  val mat: CoordinateMatrix = new CoordinateMatrix(rdd)
//  val m = mat.numRows()
//  val n = mat.numCols()
//  println(m)
//  println(n)


//  //************ example 1 for SVM ************
//  import org.apache.spark.mllib.linalg.{Vectors, Vector}
//  import org.apache.spark.mllib.regression.LabeledPoint
//  import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
//  val conf = new SparkConf().setAppName("SVMtest").setMaster("local[2]")
//  val sc = new SparkContext(conf)
//  val data = sc.textFile("/home/dong/spark-1.4.0/data/mllib/sample_svm_data.txt")
//  val parsedData = data.map { line =>
//    val parts = line.split(' ')
//    LabeledPoint(parts(0).toDouble, Vectors.dense(parts.tail.map(_.toDouble)))
//  }
//  // set the number of iterations and train the model
//  val numIterations = 20
//  val model = SVMWithSGD.train(parsedData, numIterations)
//  // compute the fraction of misclassified training samples
//  val labelAndPreds = parsedData.map { point =>
//    val prediction = model.predict(point.features)
//    (point.label, prediction)
//  }
//  val trainErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / parsedData.count
//  println("Training Error = " + trainErr)

//  //************ example 2 for SVM ************
//  import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
//  import org.apache.spark.mllib.regression.LabeledPoint
//  import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
//  import org.apache.spark.mllib.util.MLUtils
//  val conf = new SparkConf().setAppName("SVMtest").setMaster("local[2]")
//  val sc = new SparkContext(conf)
//  // Load training data in LIBSVM format.
//  val data = MLUtils.loadLibSVMFile(sc, "/home/dong/spark-1.4.0/data/mllib/sample_libsvm_data.txt")
//  // Split data into training (60%) and test (40%).
//  val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
//  val training = splits(0).cache()
//  val test = splits(1)
//  // Run the training algorithm to build the model.
//  val numIterations = 100
//  val model = SVMWithSGD.train(training, numIterations)
//  // Clear the default threshold so that predict returns raw scores.
//  model.clearThreshold()
//
//  // Compute raw scores on the test set.
//  val scoreAndLabels = test.map { point =>
//    val score = model.predict(point.features)
//    (score, point.label)
//  }
//  // Get evaluation metrics.
//  val metrics = new BinaryClassificationMetrics(scoreAndLabels)
//  val auROC = metrics.areaUnderROC()
//
//  println("Area under ROC = " + auROC)


//  //************ example 3 for LinearRegression ************
//  import org.apache.spark.mllib.regression.LinearRegressionWithSGD
//  import org.apache.spark.mllib.regression.LabeledPoint
//  import org.apache.spark.mllib.linalg.{Vectors, Vector}
//  val conf = new SparkConf().setAppName("test").setMaster("local[2]")
//  val sc = new SparkContext(conf)
//  // load and parse the data file
//  val data = sc.textFile("/home/dong/spark-1.4.0/data/mllib/ridge-data/lpsa.data")
//  val parsedData = data.map { line =>
//    val parts = line.split(',')
//    LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
//  }
//  // set the number of iterations and train the model
//  val numIterations = 20
//  val model = LinearRegressionWithSGD.train(parsedData, numIterations)
//  // compute the mean squared error on the training set
//  val valuesAndPreds = parsedData.map { point =>
//    val prediction = model.predict(point.features)
//    (point.label, prediction)
//  }
//  val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.reduce(_ + _) / valuesAndPreds.count
//  println("training Mean Squared Error = " + MSE)

}
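
Both SVM examples above call the two-argument SVMWithSGD.train overload and therefore train with the default step size, regularization, and mini-batch fraction. MLlib also exposes an overload that takes these hyperparameters explicitly. A minimal sketch, assuming the training and test RDDs built in example 2 (the values below are illustrative defaults, not tuned settings):

import org.apache.spark.mllib.classification.SVMWithSGD

// train with explicit hyperparameters: numIterations, stepSize, regParam (L2), miniBatchFraction
val tunedModel = SVMWithSGD.train(training, 100, 1.0, 0.01, 1.0)

// evaluate exactly as in example 2, e.g. clear the threshold to get raw scores for the ROC curve
tunedModel.clearThreshold()
val tunedScores = test.map(point => (tunedModel.predict(point.features), point.label))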



 

Regression algorithms:



package ling

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.log4j.{Level, Logger}

object LinearRegressionTest extends App {
  // silence the noisier loggers
  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

  val conf = new SparkConf().setAppName("LinearRegressionTest").setMaster("local[2]")
  val sc = new SparkContext(conf)

  // load and parse the data file: "label,feature1 feature2 ..."
  val data = sc.textFile("/home/hduser/spark-1.4.0/data/mllib/ridge-data/lpsa.data")
  val parsedData = data.map { line =>
    val parts = line.split(",")
    LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(" ").map(_.toDouble)))
  }

  // set the number of iterations (typically 50-200) and train the model
  val numIterations = 100
  val model = LinearRegressionWithSGD.train(parsedData, numIterations)

  // compute the mean squared error on the training set
  val valuesAndPreds = parsedData.map { point =>
    val prediction = model.predict(point.features)
    (point.label, prediction)
  }
  val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.reduce(_ + _) / valuesAndPreds.count
  println("training Mean Squared Error = " + MSE)

  // predict a single vector or an RDD of vectors (lpsa.data has 8 features per example)
//  val d = Vectors.dense(1.0, 0.0, 0.50, 0.32, 0.565, 0.5, 0.7, 0.121)
//  val rdd = sc.makeRDD(Array(Vectors.dense(1.0, 0.0, 0.50, 0.32, 0.565, 0.5, 0.7, 0.121), Vectors.dense(2.0, 1.0, 1.50, 1.32, 0.565, 4.5, 0.7, 0.1)))
//  val prediction = model.predict(d)
//  val predictions = model.predict(rdd)
//  println(prediction)
//  predictions.foreach(println)
}
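
LinearRegressionWithSGD is gradient-descent based, so it tends to converge better when the features are on comparable scales. The sketch below shows one way to standardize the features with MLlib's StandardScaler before training; it assumes the sc, parsedData, and numIterations values from the listing above, and the names scaler, scaledData, and scaledModel are illustrative.

import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

// fit a scaler on the feature vectors (zero mean, unit variance per feature)
val scaler = new StandardScaler(withMean = true, withStd = true).fit(parsedData.map(_.features))

// rebuild the labeled points with standardized features
val scaledData = parsedData.map(p => LabeledPoint(p.label, scaler.transform(p.features))).cache()

// train on the scaled data; new feature vectors must also go through scaler.transform before predict
val scaledModel = LinearRegressionWithSGD.train(scaledData, numIterations)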



 

Clustering algorithms: K-means

 



package ling

object KMeansTest extends App {

  import org.apache.log4j.{Level, Logger}
  import org.apache.spark.{SparkContext, SparkConf}
  import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
  import org.apache.spark.mllib.linalg.{Vectors, Vector}

  Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
  Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

  val conf = new SparkConf().setAppName("Kmeans").setMaster("local[2]")
  val sc = new SparkContext(conf)

  val data = sc.textFile("/home/hduser/spark-1.4.0/data/mllib/kmeans_data.txt", 1)
  val parseData = data.map(s => Vectors.dense(s.split(" ").map(_.toDouble))) // vectorize each input line

  val numClusters = 2    // number of clusters, i.e. 2 centers
  val numIterations = 15 // number of iterations
  // train a K-means model and print which cluster each center falls into
  val model = KMeans.train(parseData, numClusters, numIterations)
  println("Cluster centers")
  for (c <- model.clusterCenters) {
    println(" " + c.toString + " belongs to cluster " + model.predict(c))
  }

  // evaluate with the within-cluster sum of squared errors
  // (the sum over all points of the squared distance to their nearest center)
  val cost = model.computeCost(parseData)
  println("Within Set Sum of Squared Errors: " + cost)

//  // predict single points
//  println("predict:" + model.predict(Vectors.dense("0.1,0.2,0.3".split(",").map(_.toDouble))))
//  println("predict:" + model.predict(Vectors.dense("7.1,8.2,9.3".split(",").map(_.toDouble))))
//  println("predict:" + model.predict(Vectors.dense("4.5,4.5,4.5".split(",").map(_.toDouble))))
//  // batch prediction
//  val testdata = data.map(s => Vectors.dense(s.split(" ").map(_.toDouble)))
//  val result = model.predict(testdata)
//  result.saveAsTextFile("/home/hduser/kmeansresult") // saves only the predicted cluster ids, not the input data, so the output is hard to read

  // save each input line together with its predicted cluster
  data.map { line =>
    val linevectors = Vectors.dense(line.split(" ").map(_.toDouble))
    val prediction = model.predict(linevectors)
    line + " belongs to cluster " + prediction
  }.saveAsTextFile("/home/hduser/kmeansresult1")
}
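
The commented-out batch prediction above saves only cluster ids, and the final step saves each line with its predicted cluster, but neither keeps the trained model itself. If the model needs to be reused later, it can be persisted as well. A minimal sketch, assuming Spark 1.4+ (where KMeansModel supports save/load) and a hypothetical output directory /home/hduser/kmeansmodel:

import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.linalg.Vectors

// persist the trained model (written as metadata plus Parquet data under the given directory)
model.save(sc, "/home/hduser/kmeansmodel")

// later, possibly in another application, reload it and predict again
val reloaded = KMeansModel.load(sc, "/home/hduser/kmeansmodel")
println(reloaded.predict(Vectors.dense(0.1, 0.2, 0.3)))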



 

Collaborative filtering: a key algorithm here is Alternating Least Squares (ALS), which factors the user-item rating matrix into two low-rank matrices (user factors and item factors), so that a predicted rating is the dot product of a user-factor row and an item-factor row; it is trained with ALS.train.



package ling

import org.apache.log4j.{Level, Logger}

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating

object MovieALS1 {

  def main(args: Array[String]) {

    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    val conf = new SparkConf().setAppName("ALS Application").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // MovieLens 100k ratings: "userId \t movieId \t rating \t timestamp"
    val rawData = sc.textFile("/home/hduser/ml-100k/u.data")
    println("format of rawData")
    println(rawData.first())

    val rawRating = rawData.map(_.split("\t").take(3))
    val ratings = rawRating.map {
      case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble)
    }
    println("format of ratings")
    ratings.take(3).foreach(println)

    // *** train the ALS model: rank = 50, 10 iterations, lambda = 0.01 ***
    val model = ALS.train(ratings, 50, 10, 0.01)

    println("number of users in the user factor matrix")
    println(model.userFeatures.count)
    println("format of the user factor matrix")
    model.userFeatures.take(3).map(x => (x._1, x._2.toBuffer, x._2.length)).foreach(println)
    println("********************")
    println("number of products in the product factor matrix")
    println(model.productFeatures.count)
    println("format of the product factor matrix")
    model.productFeatures.take(3).map(x => (x._1, x._2.toBuffer, x._2.length)).foreach(println)

    // usage 1: predict user 789's rating for movie 123
    val predictedRating = model.predict(789, 123)
    println("predicted rating of user 789 for movie 123")
    println(predictedRating)

    // usage 2: batch prediction for (user, movie) pairs
    val ratings1 = ratings.map { case Rating(user, movie, rating) => (user, movie) }
    val predictedRating1 = model.predict(ratings1)
    println()
    println("batch predictions for (user, movie) pairs")
    predictedRating1.take(5).foreach(println)

    // usage 3: recommend 10 movies for user 789
    val userId = 789
    val K = 10
    val topKRecs = model.recommendProducts(userId, K)
    println("10 movies recommended for user 789:")
    println(topKRecs.mkString("\n"))

    // usage 4: sanity-check the recommendations against the user's own ratings
    val movies = sc.textFile("/home/hduser/ml-100k/u.item")
    val titles = movies.map(line => line.split("\\|").take(2)).
      map(array => (array(0).toInt, array(1))).collectAsMap()
    println("title of movie 123:")
    println(titles(123))

    // which movies has user 789 rated?
    val moviesForUser = ratings.keyBy(_.user).lookup(789)
    println("number of movies rated by user 789:")
    println(moviesForUser.size)

    // sort user 789's ratings in descending order and print the top 10
    println("top 10 movies as rated by user 789:")
    moviesForUser.sortBy(-_.rating).take(10).map(rating => (titles(rating.product), rating.rating)).foreach(println)
    println("top 10 movies the model predicts user 789 will like:")
    topKRecs.map(rating => (titles(rating.product), rating.rating)).foreach(println)

  }

}
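
As noted before the listing, ALS factors the rating matrix into user and item factor matrices, so a prediction is simply the dot product of the two factor vectors. The sketch below reconstructs the rating for (789, 123) by hand and then measures the overall fit on the training ratings with RegressionMetrics; it assumes it runs inside main after model and ratings are available, and the names userVec, movieVec, and ratesAndPreds are illustrative.

import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.mllib.recommendation.Rating

// manual prediction: dot product of the user-789 and movie-123 factor vectors
val userVec = model.userFeatures.lookup(789).head
val movieVec = model.productFeatures.lookup(123).head
val manualScore = userVec.zip(movieVec).map { case (u, m) => u * m }.sum
println("manually reconstructed rating for (789, 123): " + manualScore)

// overall fit on the training ratings: join predictions with the actual ratings
val usersProducts = ratings.map { case Rating(user, movie, rating) => (user, movie) }
val predictions = model.predict(usersProducts).map { case Rating(user, movie, rating) => ((user, movie), rating) }
val ratesAndPreds = ratings.map { case Rating(user, movie, rating) => ((user, movie), rating) }
  .join(predictions)
  .map { case (_, (actual, predicted)) => (predicted, actual) }
val metrics = new RegressionMetrics(ratesAndPreds)
println("training MSE = " + metrics.meanSquaredError)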