1. Create the SparkContext
import os
from pyspark import SparkConf, SparkContext


# Set environment variables (raw strings avoid backslash escapes in Windows paths)
os.environ['JAVA_HOME'] = r'C:\Java\jdk1.8.0_91'
# Hadoop compatibility on Windows mainly needs winutils.exe and related files in $HADOOP_HOME/bin
os.environ['HADOOP_HOME'] = r'C:\Java\hadoop-2.6.0-cdh5.7.6'
# SPARK_HOME must be set; without it the Spark application will not start
os.environ['SPARK_HOME'] = r'C:\Java\spark-2.2.0-bin-2.6.0-cdh5.7.6'

# Create SparkConf
sparkConf = SparkConf() \
    .setAppName('Python_Spark_Regression') \
    .setMaster('local[4]')

# Create SparkContext
sc = SparkContext(conf=sparkConf)
2. Read and format the data

Data fields:
instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt

# Read the data
raw_data_with_header = sc.textFile('./datas/hour.csv')

# First look at the data
print(raw_data_with_header.count())
raw_data_with_header.take(2)
header = raw_data_with_header.first()

# Filter out the header line, then split each row into fields
raw_data = raw_data_with_header.filter(lambda line: line != header)
datas_rdd = raw_data.map(lambda x: x.split(","))
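
Since the feature-extraction code below relies on hard-coded column positions, a quick sanity check can map field names to indices. This helper is a minimal sketch and not part of the original walkthrough:

# Hypothetical helper: map each column name from the header to its index,
# so fields[2] (season), fields[4:-3] (mnth..windspeed) and fields[-1] (cnt)
# can be verified against the CSV header.
field_index = {name: i for i, name in enumerate(header.split(","))}
print(field_index['season'])  # expected 2  -> categorical feature
print(field_index['mnth'])    # expected 4  -> first numeric feature column
print(field_index['cnt'])     # expected 16 -> label column (fields[-1])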
3. Feature engineering: data cleaning
# Extract the label field; values of "?" are converted to 0 and everything is cast to float
def extract_label(fields):
    label = fields[-1]
    return float(label)

def conver_float(x):
    return 0 if x == "?" else float(x)
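
For example, the conversion helper behaves as follows (illustrative calls only):

print(conver_float("?"))     # 0    -- missing values become 0
print(conver_float("0.24"))  # 0.24 -- numeric strings become floats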

import numpy as np
from pyspark.mllib.regression import LabeledPoint

def extract_features(fields, end_index):
    """
    Extract the season field as a categorical feature and the numeric fields,
    and return them concatenated: categorical feature + numeric features.
    """
    feature_season = [conver_float(fields[2])]
    numeric_features = [conver_float(column) for column in fields[4: end_index]]
    # print(numeric_features)
    return np.concatenate((feature_season, numeric_features))

extract_features(datas_rdd.first(), -3)

# Extract features and labels into LabeledPoint records
lps_rdd = datas_rdd.map(lambda fields: LabeledPoint(
    extract_label(fields), extract_features(fields, -3)))

# Randomly split the data set into training, validation and test sets with an 8:1:1 ratio
train_rdd, validation_rdd, test_rdd = lps_rdd.randomSplit([8, 1, 1])

print(train_rdd.cache().count())
print(validation_rdd.cache().count())
print(test_rdd.cache().count())
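
To confirm the (label, features) layout, one LabeledPoint can be inspected; this check is illustrative and not in the original notebook. Note that randomSplit treats the weights as relative proportions, so the three counts above are only approximately 8:1:1:

first_lp = lps_rdd.first()
print(first_lp.label)     # the cnt value for that hour
print(first_lp.features)  # season followed by the numeric columns mnth..windspeed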
4. Define the model evaluation function
from pyspark.mllib.evaluation import RegressionMetrics

# Model evaluation function
def evaluate_mode(model, validation_data):
    # Predict on the validation data with the trained model
    score = model.predict(validation_data.map(lambda lp: lp.features)).map(lambda x: float(x))

    # Pair each prediction with the corresponding true label
    score_and_label = score.zip(validation_data.map(lambda lp: lp.label))

    # Use RegressionMetrics to compute regression metrics
    metrics = RegressionMetrics(score_and_label)

    # Other available metrics
    # metrics.explainedVariance
    # metrics.meanAbsoluteError
    # metrics.meanSquaredError

    rmse = metrics.rootMeanSquaredError
    return rmse
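
rootMeanSquaredError is the usual RMSE, i.e. the square root of the mean squared difference between predictions and labels. A small hedged illustration with numpy (values chosen arbitrarily, not taken from the data set):

import numpy as np

preds  = np.array([16.0, 40.0, 32.0])   # example predictions
labels = np.array([16.0, 36.0, 40.0])   # example true counts
rmse_example = np.sqrt(np.mean((preds - labels) ** 2))
print(rmse_example)  # about 5.16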
5. Define the model training and evaluation function
from pyspark.mllib.tree import DecisionTree
from time import time

"""
Build a function that trains and evaluates a model,
and measures how long training plus evaluation takes.
"""
def train_evaluate_model(train_data, validation_data, param_depth, param_bins):
    start_time = time()  # training start time

    # Train the decision-tree regression model
    model = DecisionTree.trainRegressor(
        train_data, categoricalFeaturesInfo={},
        impurity='variance',
        maxDepth=param_depth,
        maxBins=param_bins)

    # Evaluate the model on the validation set
    rmse = evaluate_mode(model, validation_data)

    duration = time() - start_time  # elapsed time
    print("Training/evaluation parameters: impurity = variance" +
          ", maxDepth = " + str(param_depth) +
          ", maxBins = " + str(param_bins) +
          " => time = " + str(duration) + ", RMSE = " + str(rmse))

    return (rmse, duration, 'variance', param_depth, param_bins, model)

rmse, duration, impurity, depth, bins, model = train_evaluate_model(
    train_rdd,
    validation_rdd, 10, 32)
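
The returned DecisionTreeModel can also be inspected directly; depth(), numNodes() and toDebugString() are standard methods on the model object, shown here only as an optional check:

print(model.depth())     # actual depth of the fitted tree
print(model.numNodes())  # total number of tree nodes
# print(model.toDebugString())  # full if/else structure (can be very long)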
6. Search over multiple hyper-parameters and select the best model
def train_evaluate_params(train_datas, validation_datas, depth_list, bins_list):
    # Train and evaluate a model for every (depth, bins) combination
    metrics_list = [train_evaluate_model(train_datas, validation_datas, depth, bins)
                    for depth in depth_list
                    for bins in bins_list]

    # Sort by RMSE in ascending order; the best model has the smallest RMSE
    sorted_metrics_list = sorted(metrics_list, key=lambda k: k[0], reverse=False)

    # Take the best parameter combination
    best_params = sorted_metrics_list[0]

    # Print the best parameter combination
    print("Best parameters: impurity -> variance" +
          ", depth -> " + str(best_params[3]) +
          ", bins -> " + str(best_params[4]) +
          "\n RMSE -> " + str(best_params[0]))

    # Return the best model
    return best_params[5]

best_model = train_evaluate_params(train_rdd,
                                   validation_rdd,
                                   [5, 10, 20],
                                   [16, 32, 64])
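
As a final step (a minimal sketch, assuming the evaluate_mode function and test_rdd defined earlier), the selected model can be scored once against the held-out test set:

# Evaluate the best model on the test set that was never used for tuning
test_rmse = evaluate_mode(best_model, test_rdd)
print("Test RMSE = " + str(test_rmse))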