1. Binary classification

Predict whether a web page is ephemeral or evergreen.

》Read the file and create a DataFrame

from pyspark.context import SparkContext 
from pyspark.sql.session import SparkSession
# sc = SparkContext("local")
# spark = SparkSession(sc)

# Read the tab-delimited text file into a DataFrame
row_df = spark.read.format("csv").option("header", "true").option("delimiter", "\t").load(r"D:\data\spark\train.tsv")
row_df.count()
# 7395
row_df.printSchema()
root
 |-- url: string (nullable = true)
 |-- urlid: string (nullable = true)
 |-- boilerplate: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: string (nullable = true)
 |-- avglinksize: string (nullable = true)
 |-- commonlinkratio_1: string (nullable = true)
 |-- commonlinkratio_2: string (nullable = true)
 |-- commonlinkratio_3: string (nullable = true)
..........
row_df.select('url', 'alchemy_category', 'alchemy_category_score', 'is_news', 'label').show(10)
+--------------------+------------------+----------------------+-------+-----+
|                 url|  alchemy_category|alchemy_category_score|is_news|label|
+--------------------+------------------+----------------------+-------+-----+
|http://www.bloomb...|          business|              0.789131|      1|    0|
|http://www.popsci...|        recreation|              0.574147|      1|    1|
|http://www.menshe...|            health|              0.996526|      1|    1|
|http://www.dumbli...|            health|              0.801248|      1|    1|
|http://bleacherre...|            sports|              0.719157|      1|    0|
|http://www.conven...|                 ?|                     ?|      ?|    0|
|http://gofashionl...|arts_entertainment|               0.22111|      1|    1|
|http://www.inside...|                 ?|                     ?|      ?|    0|
|http://www.valetm...|                 ?|                     ?|      1|    1|
|http://www.howswe...|                 ?|                     ?|      ?|    1|
+--------------------+------------------+----------------------+-------+-----+
only showing top 10 rows

》Define a UDF to convert "?" to "0"

from pyspark.sql.functions import udf
def replace_question(x):
    return "0" if x == "?" else x
# Wrap the Python function as a Spark UDF (rebinding the same name)
replace_question = udf(replace_question)


from pyspark.sql.functions import col
import pyspark.sql.types

# Apply the UDF to every column from alchemy_category_score onward and cast it to double
df = row_df.select(['url', 'alchemy_category'] + [replace_question(col(column)).cast("double").alias(column) for column in row_df.columns[4:]])

df.printSchema()
root
 |-- url: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: double (nullable = true)
 |-- avglinksize: double (nullable = true)
 |-- commonlinkratio_1: double (nullable = true)
 |-- commonlinkratio_2: double (nullable = true)
..........

》Split into training and test sets and cache them in memory

train_df, test_df = df.randomSplit([0.7, 0.3])
train_df.cache()
test_df.cache()
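
A quick sanity check on the split (a sketch; exact counts vary because this randomSplit call is not seeded):
print(train_df.count(), test_df.count())   # roughly 70% / 30% of the 7395 rows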

》Convert the category string into a numeric index with StringIndexer

from pyspark.ml.feature import StringIndexer

categoryIndexer = StringIndexer(inputCol='alchemy_category', outputCol='alchemy_category_Index')
categoryTransformer = categoryIndexer.fit(df)
category_train = categoryTransformer.transform(train_df)
category_train.select("alchemy_category", "alchemy_category_Index").show(5)
+------------------+----------------------+
|  alchemy_category|alchemy_category_Index|
+------------------+----------------------+
|arts_entertainment|                   2.0|
|        recreation|                   1.0|
|                 ?|                   0.0|
|                 ?|                   0.0|
|          business|                   3.0|
+------------------+----------------------+
only showing top 5 rows
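
The fitted StringIndexerModel exposes the learned mapping; as a sketch, its labels list the categories in descending frequency (the first few match the indexes shown above):
categoryTransformer.labels
# ['?', 'recreation', 'arts_entertainment', 'business', ...]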

》One-hot encode the category index

from pyspark.ml.feature import OneHotEncoder

onehot = OneHotEncoder(dropLast= False, inputCol="alchemy_category_Index", outputCol= "alchemy_category_vector")
df2 = onehot.transform(category_train)
df2.select("alchemy_category", "alchemy_category_Index", "alchemy_category_vector").show(5)
# The output is a 14-dimensional sparse vector
+------------------+----------------------+-----------------------+
|  alchemy_category|alchemy_category_Index|alchemy_category_vector|
+------------------+----------------------+-----------------------+
|arts_entertainment|                   2.0|         (14,[2],[1.0])|
|        recreation|                   1.0|         (14,[1],[1.0])|
|                 ?|                   0.0|         (14,[0],[1.0])|
|                 ?|                   0.0|         (14,[0],[1.0])|
|          business|                   3.0|         (14,[3],[1.0])|
+------------------+----------------------+-----------------------+
only showing top 5 rows
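
Note: the transformer-style OneHotEncoder used above is the Spark 2.x API. On Spark 3.x, OneHotEncoder is an Estimator and must be fit first; a minimal sketch of the equivalent call (assumption: running on Spark 3.x):
onehot = OneHotEncoder(dropLast=False, inputCols=["alchemy_category_Index"], outputCols=["alchemy_category_vector"])
df2 = onehot.fit(category_train).transform(category_train)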

》Assemble the feature columns into a single vector column; the model then only needs this combined features column.

from pyspark.ml.feature import VectorAssembler

# Select the features to assemble: the one-hot vector plus the numeric columns
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]


assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df3 = assembler.transform(df2)
print(df3.columns)
['url', 'alchemy_category', 'alchemy_category_score', 'avglinksize', 'commonlinkratio_1', 'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4', 'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio', 'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news', 'lengthyLinkDomain', 'linkwordscore', 'news_front_page', 'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url', 'parametrizedLinkRatio', 'spelling_errors_ratio', 'label', 'alchemy_category_Index', 'alchemy_category_vector', 'features']
df3.select("features").first() #抽取得到所需要建模的特征
Row(features=SparseVector(36, {2: 1.0, 14: 0.5811, 15: 2.5268, 16: 0.6807, 17: 0.3277, 18: 0.2689, 19: 0.2605, 20: 0.471, 23: 0.0277, 25: 0.2184, 26: 0.1053, 27: 1.0, 28: 1.0, 29: 13.0, 31: 16382.0, 32: 238.0, 33: 8.0, 34: 0.3361, 35: 0.0943}))

df2.select("spelling_errors_ratio").first()
Row(spelling_errors_ratio=0.094298246)

》Train a decision tree classifier

from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
dt_model = dt.fit(df3)
print(dt_model)
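
The fitted tree can be inspected before moving on; a sketch using the model's standard attributes:
print(dt_model.featureImportances)      # importance of each of the 36 assembled features
print(dt_model.toDebugString[:300])     # first part of the learned tree structure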

》The same transformations have to be repeated on the test set, which is tedious to do by hand, so it is better to use a Pipeline

# 1. Build the pipeline
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier

stringIndex = StringIndexer(inputCol="alchemy_category", outputCol="alchemy_category_index")
onehoter = OneHotEncoder(dropLast=False, inputCol="alchemy_category_index", outputCol="alchemy_category_vector")
assemblerInputs = ["alchemy_category_vector"] + row_df.columns[4:-1]
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", maxDepth=10, maxBins=14)
pipeline = Pipeline(stages = [stringIndex, onehoter, assembler, dt])
pipeline.getStages() # inspect the pipeline stages
[StringIndexer_6c703f5b0925,
 OneHotEncoder_d71a3c746298,
 VectorAssembler_72028ec11847,
 DecisionTreeClassifier_ecf3417a4a68]

》Use the pipeline for data processing and model training

pipelineModel = pipeline.fit(train_df)
pipelineModel.stages[3]
# Make predictions on the test set
predicted = pipelineModel.transform(test_df)
predicted.columns
[...
 'spelling_errors_ratio',
 'label',
 'alchemy_category_index',
 'alchemy_category_vector',
 'features',
 'rawPrediction',
 'probability',
 'prediction']
# Predicted class and prediction probabilities vs. the true label
predicted.select("rawPrediction", "probability", "prediction", "label").take(5)
[Row(rawPrediction=DenseVector([56.0, 123.0]), probability=DenseVector([0.3128, 0.6872]), prediction=1.0, label=1.0),
 Row(rawPrediction=DenseVector([132.0, 32.0]), probability=DenseVector([0.8049, 0.1951]), prediction=0.0, label=0.0),
 Row(rawPrediction=DenseVector([132.0, 32.0]), probability=DenseVector([0.8049, 0.1951]), prediction=0.0, label=0.0),
 Row(rawPrediction=DenseVector([28.0, 221.0]), probability=DenseVector([0.1124, 0.8876]), prediction=1.0, label=1.0),
 Row(rawPrediction=DenseVector([120.0, 46.0]), probability=DenseVector([0.7229, 0.2771]), prediction=0.0, label=0.0)]

》Evaluate the model with AUC

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")
auc = evaluator.evaluate(predicted)
auc    #0.62
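
For comparison, the same evaluator class can report the area under the precision-recall curve by switching metricName (value not recorded in the original run):
evaluator_pr = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderPR")
evaluator_pr.evaluate(predicted)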

》Use TrainValidationSplit to find the best parameters

from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder().addGrid(dt.impurity, ["gini", "entropy"]).addGrid(dt.maxDepth, [5, 10, 15]).addGrid(dt.maxBins, [10, 15, 20]).build()
# Search for the best parameters on the training set (80% train / 20% validation)
tsv = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)
tsv_pipeline = Pipeline(stages = [stringIndex, onehoter, assembler, tsv]) # replace dt with the TrainValidationSplit stage
tsv_pipelineModel = tsv_pipeline.fit(train_df)
bestModel = tsv_pipelineModel.stages[3].bestModel # extract the best model found
bestModel
tsv_prediction = tsv_pipelineModel.transform(test_df)
tsv_auc = evaluator.evaluate(tsv_prediction)
tsv_auc    #0.65
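
The fitted TrainValidationSplitModel also keeps the validation metric for every parameter combination, which shows how much the parameter choice matters; a sketch:
tvs_stage = tsv_pipelineModel.stages[3]                  # the TrainValidationSplitModel
list(zip(tvs_stage.validationMetrics, paramGrid))[:3]    # validation AUC for the first few combinations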

》Use cross-validation

from pyspark.ml.tuning import CrossValidator

cv = CrossValidator(estimator=dt, evaluator= evaluator, estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages = [stringIndex, onehoter, assembler, cv])
cv_pipelineModel = cv_pipeline.fit(train_df)
bestModel = cv_pipelineModel.stages[3].bestModel
bestModel
cv_prediction = cv_pipelineModel.transform(test_df)
cv_auc = evaluator.evaluate(cv_prediction)
cv_auc  # 0.67
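
Similarly, CrossValidatorModel records the metric averaged over the 3 folds for each parameter combination:
cvModel = cv_pipelineModel.stages[3]   # the CrossValidatorModel
cvModel.avgMetrics[:5]                 # mean AUC per parameter combination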

》Use a random forest, which performs better

from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50)
rfpipeline = Pipeline(stages = [stringIndex, onehoter, assembler, rf])
rfpipelineModel = rfpipeline.fit(train_df)
rf_prediction = rfpipelineModel.transform(test_df)
evaluator.evaluate(rf_prediction) # 0.73
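
As with the decision tree, the fitted forest exposes its trees and feature importances; a sketch:
rf_model = rfpipelineModel.stages[3]   # the RandomForestClassificationModel
print(len(rf_model.trees))             # 50 trees
print(rf_model.featureImportances)     # importance over the 36 assembled features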

# Use 3-fold cross-validation to find the best random forest parameters
paramGrid = ParamGridBuilder().addGrid(rf.impurity, ["gini", "entropy"]).addGrid(rf.maxDepth, [5, 10, 15]).addGrid(rf.maxBins, [10, 15, 20]). \
addGrid(rf.numTrees, [10, 20, 60]).build()
cv_rf = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)
rfcv_pipeline = Pipeline(stages = [stringIndex, onehoter, assembler, cv_rf])
rfcvModel = rfcv_pipeline.fit(train_df)
rfcv_prediction = rfcvModel.transform(test_df)
evaluator.evaluate(rfcv_prediction)    # 0.75

2. Multiclass classification

Predict the forest cover type.

》Read the file

file = r'D:\data\spark\covtype.data'
rawData = sc.textFile(file)
lines = rawData.map(lambda x: x.split(","))
lines.count() #581012

# Check the number of fields
fieldNum = len(lines.first())
fieldNum # 55
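
As an alternative to the RDD route, the same file could be read directly as a DataFrame with schema inference (a sketch; the header-less file then gets default column names _c0 ... _c54):
covtype_df2 = spark.read.csv(file, header=False, inferSchema=True)
covtype_df2.printSchema()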

》Create column names and the DataFrame

from pyspark.sql.types import StringType, StructField, StructType
fields = [StructField("f" + str(i), StringType(), True) for i in range(fieldNum)]
schema = StructType(fields)
covtype_df = spark.createDataFrame(lines, schema)
covtype_df.columns
covtype_df.printSchema()
root
 |-- f0: string (nullable = true)
 |-- f1: string (nullable = true)
 |-- f2: string (nullable = true)
 |-- f3: string (nullable = true)
 |-- f4: string (nullable = true)
 |-- f5: string (nullable = true)
 |-- f6: string (nullable = true)
 |-- f7: string (nullable = true)
 |-- f8: string (nullable = true)

》Cast the string columns to double

# Cast to double; col() references a column by name
from pyspark.sql.functions import col
covtype_df = covtype_df.select([col(column).cast("double").alias(column) for column in covtype_df.columns])
covtype_df.show(2)

# Build the label with withColumn: the last column (f54) holds the class, shifted to start from 0
covtype_df = covtype_df.withColumn("label", covtype_df["f54"]-1).drop("f54")
covtype_df.show(1)
+------+----+---+-----+---+-----+-----+-----+-----+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+
|    f0|  f1| f2|   f3| f4|   f5|   f6|   f7|   f8|    f9|f10|f11|f12|f13|f14|f15|f16|f17|f18|f19|f20|f21|f22|f23|f24|f25|f26|f27|f28|f29|f30|f31|f32|f33|f34|f35|f36|f37|f38|f39|f40|f41|f42|f43|f44|f45|f46|f47|f48|f49|f50|f51|f52|f53|label|
+------+----+---+-----+---+-----+-----+-----+-----+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+
|2596.0|51.0|3.0|258.0|0.0|510.0|221.0|232.0|148.0|6279.0|1.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|1.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|0.0|  4.0|
+------+----+---+-----+---+-----+-----+-----+-----+------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+-----+
only showing top 1 row
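
Before training it is worth checking the class distribution, since the cover types are quite imbalanced:
covtype_df.groupBy("label").count().orderBy("label").show()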

》Split the data into training and test sets

# Randomly split the data and cache() both sets to speed up later steps
train_df, test_df = covtype_df.randomSplit(weights=[0.7, 0.3], seed=666) # fixed random seed
train_df.cache()
test_df.cache()

》Use a pipeline

# Train the model
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import evaluation
featuresCols = covtype_df.columns[:54]
# featuresCols
vec = VectorAssembler(inputCols=featuresCols, outputCol="features")
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features", impurity="gini", seed=666, maxBins=20, maxDepth=5)
dt_pipeline = Pipeline(stages = [vec, dt])
dtModel = dt_pipeline.fit(train_df)
dt_prediction = dtModel.transform(test_df)
dt_prediction.columns

dt_prediction.select(["features", "probability", "rawPrediction", "prediction", "label"]).show(2)
+--------------------+--------------------+--------------------+----------+-----+
|            features|         probability|       rawPrediction|prediction|label|
+--------------------+--------------------+--------------------+----------+-----+
|(54,[0,1,2,3,4,5,...|[0.0,0.0244421380...|[0.0,471.0,12438....|       2.0|  2.0|
|(54,[0,1,2,3,4,5,...|[0.0,0.0244421380...|[0.0,471.0,12438....|       2.0|  2.0|
+--------------------+--------------------+--------------------+----------+-----+
only showing top 2 rows

》Evaluate the model

# Evaluate the model's accuracy
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy") # use accuracy as the metric
acc = evaluator.evaluate(dt_prediction)
acc # 0.70
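
A per-class breakdown gives more insight than overall accuracy; a rough confusion matrix via groupBy, plus weighted F1 from the same evaluator class:
dt_prediction.groupBy("label", "prediction").count().orderBy("label", "prediction").show()
MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1").evaluate(dt_prediction)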

# Find the best model; TrainValidationSplit takes an estimator, a parameter grid, and an evaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder().addGrid(dt.impurity, ["gini", "entropy"]).addGrid(dt.maxDepth, [10, 15, 25]).addGrid(dt.maxBins, [30, 40, 50]).build()
tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8, seed=666)
tvs_pipeline = Pipeline(stages=[vec, tvs])
tvsModel = tvs_pipeline.fit(train_df)

bestModel = tvsModel.stages[1].bestModel
bestModel.toDebugString[:500] # inspect the first part of the learned tree

# Rename columns with withColumnRenamed for readability
predictions = tvsModel.transform(test_df)
result = predictions.withColumnRenamed("f0", "elevation").withColumnRenamed("f1", "aspect") \
.withColumnRenamed("f2", "slope").withColumnRenamed("f3", "vertical_distance") \
.withColumnRenamed("f4", "horizontal_distance").withColumnRenamed("f5", "shade")
result.select("elevation", "aspect", "slope", "vertical_distance", "horizontal_distance", "shade", "label", "prediction").show(5)
+---------+------+-----+-----------------+-------------------+-----+-----+----------+
|elevation|aspect|slope|vertical_distance|horizontal_distance|shade|label|prediction|
+---------+------+-----+-----------------+-------------------+-----+-----+----------+
|   1867.0|  20.0| 15.0|            108.0|               19.0|120.0|  2.0|       2.0|
|   1876.0|  25.0| 17.0|            124.0|               26.0|150.0|  2.0|       2.0|
|   1876.0|  29.0| 19.0|            124.0|               34.0| 90.0|  5.0|       5.0|
|   1877.0|  27.0| 24.0|             90.0|               18.0| 95.0|  5.0|       5.0|
|   1883.0|  29.0| 24.0|             60.0|               24.0|108.0|  5.0|       5.0|
+---------+------+-----+-----------------+-------------------+-----+-----+----------+
only showing top 5 rows

》Compute the accuracy

# Accuracy of the tuned model on the test set
acc = evaluator.evaluate(predictions)
acc
# 0.9292631820660754

3. Regression

The "bike sharing" dataset is used to predict the number of bikes rented per hour under different conditions.

》Create the DataFrame

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark Regression").config("spark.some.config.option", "some-value").getOrCreate()
# Create the DataFrame from the CSV file
file = r'D:\data\spark\hour.csv'
hour_df = spark.read.format('csv').option("header", "true").load(file)
hour_df.count()    # 17379
hour_df.printSchema()
root
 |-- instant: string (nullable = true)
 |-- dteday: string (nullable = true)
 |-- season: string (nullable = true)
 |-- yr: string (nullable = true)
 |-- mnth: string (nullable = true)
 |-- hr: string (nullable = true)
 |-- holiday: string (nullable = true)
 |-- weekday: string (nullable = true)
 |-- workingday: string (nullable = true)
 |-- weathersit: string (nullable = true)
 |-- temp: string (nullable = true)
 |-- atemp: string (nullable = true)
 |-- hum: string (nullable = true)
 |-- windspeed: string (nullable = true)
 |-- casual: string (nullable = true)
 |-- registered: string (nullable = true)
 |-- cnt: string (nullable = true)
# Drop columns that are not needed
hour_df = hour_df.drop("instant").drop("yr").drop("dteday").drop("casual").drop("registered")

# Cast the string columns to double: col() reads a column, cast("double") converts it
from pyspark.sql.functions import col
hour_df = hour_df.select([col(column).cast("double").alias(column) for column in hour_df.columns])
hour_df.printSchema()
root
 |-- season: double (nullable = true)
 |-- mnth: double (nullable = true)
 |-- hr: double (nullable = true)
 |-- holiday: double (nullable = true)
 |-- weekday: double (nullable = true)
 |-- workingday: double (nullable = true)
 |-- weathersit: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- hum: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- cnt: double (nullable = true)
hour_df.show(5)
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|season|mnth| hr|holiday|weekday|workingday|weathersit|temp| atemp| hum|windspeed| cnt|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
|   1.0| 1.0|0.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.81|      0.0|16.0|
|   1.0| 1.0|1.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|40.0|
|   1.0| 1.0|2.0|    0.0|    6.0|       0.0|       1.0|0.22|0.2727| 0.8|      0.0|32.0|
|   1.0| 1.0|3.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0|13.0|
|   1.0| 1.0|4.0|    0.0|    6.0|       0.0|       1.0|0.24|0.2879|0.75|      0.0| 1.0|
+------+----+---+-------+-------+----------+----------+----+------+----+---------+----+
only showing top 5 rows

》Split into training and test sets

train_df, test_df = hour_df.randomSplit(weights=[0.7, 0.3], seed=666)
train_df.cache()
test_df.cache()

》Build the pipeline

# Use VectorIndexer so that month, weekday, and hour are treated as categorical features
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer, VectorAssembler, StringIndexer
from pyspark.ml.regression import DecisionTreeRegressor
featuresCols = hour_df.columns[:-1]
featuresCols

vector = VectorAssembler(inputCols=featuresCols, outputCol="va_features")
vIndexer = VectorIndexer(inputCol="va_features", outputCol="features", maxCategories=24)
dt = DecisionTreeRegressor(featuresCol="features", labelCol="cnt", seed=1024)
dt_pipeline = Pipeline(stages=[vector, vIndexer, dt])
dt_pipeline.getStages()

dtModel = dt_pipeline.fit(train_df)
prediction = dtModel.transform(test_df)
prediction.columns

prediction.select("cnt", "prediction").show(5)
+----+-----------------+
| cnt|       prediction|
+----+-----------------+
|39.0|           59.575|
|10.0|38.08888888888889|
|12.0|38.08888888888889|
|13.0|38.08888888888889|
|16.0|38.08888888888889|
+----+-----------------+
only showing top 5 rows

# The predictions are quite poor

》Evaluate the model

from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(labelCol="cnt", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(prediction)
rmse # 93.2708
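
RMSE is on the scale of cnt itself; for a complementary view, the same evaluator class can report R² (value not recorded in the original run):
evaluator_r2 = RegressionEvaluator(labelCol="cnt", predictionCol="prediction", metricName="r2")
evaluator_r2.evaluate(prediction)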

# Find the best model parameters
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

paramGrid = ParamGridBuilder().addGrid(dt.maxDepth, [5, 10, 15, 25]).addGrid(dt.maxBins, [25, 35, 45, 50]).build() # maxBins must be at least 24 (the number of hour categories)
tvs = TrainValidationSplit(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, trainRatio=0.8)
tvs_pipeline = Pipeline(stages=[vector, vIndexer, tvs])
tvs_model = tvs_pipeline.fit(train_df)
tvs_prediction = tvs_model.transform(test_df)
tvs_rmse = evaluator.evaluate(tvs_prediction)
tvs_rmse # 82.3553

# Use cross-validation
from pyspark.ml.tuning import CrossValidator
cv = CrossValidator(estimator=dt, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=3)
cv_pipeline = Pipeline(stages = [vector, vIndexer, cv])
cv_model = cv_pipeline.fit(train_df)
cv_prediction = cv_model.transform(test_df)
cv_rmse = evaluator.evaluate(cv_prediction)
cv_rmse    #82.483

》Train with gradient-boosted trees (GBT)

from pyspark.ml.regression import GBTRegressor
gbdt = GBTRegressor(labelCol="cnt", featuresCol="features")
gbdt_pipeline = Pipeline(stages=[vector, vIndexer, gbdt])
gbdt_model = gbdt_pipeline.fit(train_df)
gbdt_prediction = gbdt_model.transform(test_df)
gbdt_rmse = evaluator.evaluate(gbdt_prediction)
gbdt_rmse    # 75.863

# Use cross-validation to find the best GBT parameters
paramGrid = ParamGridBuilder().addGrid(gbdt.maxDepth, [5, 10]).addGrid(gbdt.maxBins, [25, 40]).addGrid(gbdt.maxIter, [10, 50]).build()
cross = CrossValidator(estimator=gbdt, evaluator=evaluator, numFolds=3, estimatorParamMaps=paramGrid, seed=1024)
gbdt_cross = Pipeline(stages=[vector, vIndexer, cross])

gbdt_model = gbdt_cross.fit(train_df)
pred = gbdt_model.transform(test_df)
rmse = evaluator.evaluate(pred)
rmse #  70
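
The boosted model chosen by cross-validation can be pulled out of the fitted pipeline in the same way as before (the CrossValidator is the third stage here); a sketch:
best_gbt = gbdt_model.stages[2].bestModel   # the GBTRegressionModel selected by CV
print(len(best_gbt.trees))                  # number of boosting iterations in the best model
print(best_gbt.featureImportances)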