线性回归:通过历史数据拟合出一条直线,用这条直线对新的数据进行预测。

房价预测分析:

package cn.doitedu.ml

import org.apache.log4j.{Level, Logger}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction

import scala.collection.mutable

/**
 * House-price prediction with Spark ML linear regression.
 *
 * Reads a training CSV and a test CSV (both with a header row and the columns
 * `area`, `floor`, `price`), fits a regularized linear regression on the
 * training set, predicts prices for the test set, prints the predictions and
 * reports the root-mean-squared error of those predictions.
 */
object LinearPrice {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache").setLevel(Level.WARN)

    val spark = SparkSession
      .builder()
      .appName("房价预测")
      .master("local")
      .getOrCreate()
    import org.apache.spark.sql.functions._
    import spark.implicits._

    // Release the session even when reading, training or evaluation fails.
    try {
      val train = spark.read
        .option("header", true)
        .option("inferSchema", true)
        .csv("userprofile/data/linear/sample")

      // Packs an array column into an ML dense vector.
      // `Vector` is an interface with two implementations: DenseVector and SparseVector.
      // The parameter is typed as Seq[Double] rather than mutable.WrappedArray[Double]
      // so the UDF does not depend on the concrete collection class Spark passes in.
      val arr2Vec: UserDefinedFunction =
        udf((arr: Seq[Double]) => Vectors.dense(arr.toArray))

      // Input columns: area, floor, price.
      // One shared projection so that training and test data are prepared identically.
      val modelColumns = Seq(
        arr2Vec(array($"area", $"floor")) as "features",
        $"price"
      )

      val trainVecDF = train.select(modelColumns: _*)

      // Configure the algorithm.
      val linearRegression = new LinearRegression()
        .setRegParam(0.1) // regularization parameter, guards against overfitting
        .setLabelCol("price")
        .setFeaturesCol("features")

      // Train the model.
      val model = linearRegression.fit(trainVecDF)

      // Load the test data and vectorize it. This must be built from the test
      // set, not from the training DataFrame, otherwise the evaluation below
      // only measures how well the model fits the data it was trained on.
      val test = spark.read
        .option("header", true)
        .option("inferSchema", true)
        .csv("userprofile/data/linear/test")
      val testVecDF = test.select(modelColumns: _*)

      // Use the trained model to predict prices for the test data.
      val result = model.transform(testVecDF)

      result.show(100, false)

      // Evaluate prediction quality with root-mean-squared error.
      val regressionEvaluator = new RegressionEvaluator()
        .setPredictionCol("prediction")
        .setLabelCol("price")
        .setMetricName("rmse")

      val rmse: Double = regressionEvaluator.evaluate(result)
      println(s"RMSE = $rmse")
    } finally {
      spark.close()
    }
  }
}