1. Abstract
The data set (student-mat.csv) comes from Kaggle and records background factors that directly influence students' grades. This project uses Python to explore those factors and to model the final grade.
2. Background:
Many factors influence a student's academic performance, but as far as the student is concerned, the decisive ones fall mainly into four areas: the psychological state brought to learning, intellectual ability, study methods, and study time. Starting from these four areas and the learning characteristics of secondary-school students, this project analyses the data to see how such factors relate to final grades, with the aim of helping students strengthen their mindset, develop their abilities, improve their study methods, and organise their time more effectively.
3. Process and Code:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error
import scipy
import pickle


# Initial setup
plt.rcParams['font.sans-serif'] = ['SimHei']  # use the SimHei font for plot titles and labels
plt.rcParams['axes.unicode_minus'] = False    # keep the minus sign from rendering as a box in saved figures
sns.set(font='SimHei')                        # apply the same font to seaborn
student = pd.read_csv('student-mat.csv')
# print(student.head())

# Inspect the distribution of the final grade G3
# print(student['G3'].describe())

# Count the number of students at each final-grade value
grade_counts = student['G3'].value_counts().sort_values().plot.barh(width=.9, color=sns.color_palette('inferno', 40))
grade_counts.axes.set_title('Number of students at each final grade', fontsize=30)
grade_counts.set_xlabel('Number of students', fontsize=30)
grade_counts.set_ylabel('Final grade (G3)', fontsize=30)
plt.show()
# Show the grade distribution from low to high
grade_distribution = sns.countplot(x='G3', data=student)
grade_distribution.set_title('Final grade distribution', fontsize=30)
grade_distribution.set_xlabel('Final grade', fontsize=20)
grade_distribution.set_ylabel('Number of students', fontsize=20)
plt.show()

# Check every column for null values; if there are none, the zeros in G3 are genuine zero scores
# print(student.isnull().any())
# Gender breakdown
male_studs = len(student[student['sex'] == 'M'])
female_studs = len(student[student['sex'] == 'F'])
print('Number of male students:', male_studs)
print('Number of female students:', female_studs)

# Age distribution (density curve)
age_distribution = sns.kdeplot(student['age'], shade=True)
age_distribution.axes.set_title('Student age distribution', fontsize=30)
age_distribution.set_xlabel('Age', fontsize=20)
age_distribution.set_ylabel('Density', fontsize=20)
plt.show()
# Age distribution by gender (bar chart)
age_distribution_sex = sns.countplot(x='age', hue='sex', data=student)
age_distribution_sex.axes.set_title('Number of students at each age', fontsize=30)
age_distribution_sex.set_xlabel('Age', fontsize=30)
age_distribution_sex.set_ylabel('Number of students', fontsize=30)
plt.show()
# Box plot of grades by age
age_grade_boxplot = sns.boxplot(x='age', y='G3', data=student)
age_grade_boxplot.axes.set_title('Age vs. final grade', fontsize=30)
age_grade_boxplot.set_xlabel('Age', fontsize=20)
age_grade_boxplot.set_ylabel('Final grade', fontsize=20)
plt.show()
# Swarm plot of grades by age
age_grade_swarmplot = sns.swarmplot(x='age', y='G3', data=student)
age_grade_swarmplot.axes.set_title('Age vs. final grade', fontsize=30)
age_grade_swarmplot.set_xlabel('Age', fontsize=20)
age_grade_swarmplot.set_ylabel('Final grade', fontsize=20)
plt.show()
# Urban vs. rural student counts
areas_countplot = sns.countplot(x='address', data=student)
areas_countplot.axes.set_title('Urban vs. rural students', fontsize=30)
areas_countplot.set_xlabel('Home address (U = urban, R = rural)', fontsize=20)
areas_countplot.set_ylabel('Number of students', fontsize=20)
plt.show()
# Grade distribution by address
sns.kdeplot(student.loc[student['address'] == 'U', 'G3'], label='Urban', shade=True)
sns.kdeplot(student.loc[student['address'] == 'R', 'G3'], label='Rural', shade=True)
plt.title('Do urban students get better grades?', fontsize=20)
plt.xlabel('Final grade', fontsize=20)
plt.ylabel('Density', fontsize=20)
plt.show()

# Keep the target attribute G3 as the labels
labels = student['G3']

# Drop the school attribute and the earlier period grades G1 and G2,
# which would otherwise reveal most of the information about G3
student = student.drop(['school', 'G1', 'G2'], axis='columns')

# One-hot encode the categorical variables
student = pd.get_dummies(student)

# Select the attributes most strongly correlated with G3
# (the first entry is G3 itself, so this keeps the top 8 features)
most_correlated = student.corr().abs()['G3'].sort_values(ascending=False)
most_correlated = most_correlated[:9]
print(most_correlated)
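To make the printed ranking easier to read, the same series can also be drawn as a chart. The following is a minimal, optional sketch that reuses the most_correlated variable computed in the step above:

# Horizontal bar chart of the top correlations (G3 itself is dropped from the series)
most_correlated.drop('G3').sort_values().plot.barh(color='steelblue')
plt.title('Attributes most correlated with G3 (absolute value)', fontsize=20)
plt.xlabel('|correlation with G3|', fontsize=16)
plt.tight_layout()
plt.show()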
# Final grade vs. number of past class failures
failures_swarmplot = sns.swarmplot(x=student['failures'], y=student['G3'])
failures_swarmplot.axes.set_title('Do students with fewer past failures score higher?', fontsize=30)
failures_swarmplot.set_xlabel('Number of past failures', fontsize=20)
failures_swarmplot.set_ylabel('Final grade', fontsize=20)
plt.show()
# Effect of the parents' education level
family_ed = student['Fedu'] + student['Medu']
family_ed_boxplot = sns.boxplot(x=family_ed, y=student['G3'])
family_ed_boxplot.axes.set_title('Effect of parental education level', fontsize=30)
family_ed_boxplot.set_xlabel('Combined parental education (Medu + Fedu)', fontsize=20)
family_ed_boxplot.set_ylabel('Final grade', fontsize=20)
plt.show()
# Effect of the student's own wish to pursue higher education
personal_wish = sns.boxplot(x=student['higher_yes'], y=student['G3'])
personal_wish.axes.set_title('Effect of wanting to pursue higher education', fontsize=30)
personal_wish.set_xlabel('Wants higher education (1 = yes)', fontsize=20)
personal_wish.set_ylabel('Final grade', fontsize=20)
plt.show()
# Split the data set (the G3 column is kept in the features for now so the
# training-set median can be used as a baseline; it is dropped before fitting)
X_train, X_test, y_train, y_test = train_test_split(student, labels, test_size=0.25, random_state=42)

# Compute the mean absolute error (MAE) and root mean squared error (RMSE)
def evaluate_predictions(predictions, true):
    mae = np.mean(abs(predictions - true))
    rmse = np.sqrt(np.mean((predictions - true) ** 2))

    return mae, rmse

# Median final grade of the training set
median_pred = X_train['G3'].median()

# Predict the median for every test example
median_preds = [median_pred for _ in range(len(X_test))]

# True G3 values of the test set, to pass to the function
true = X_test['G3']

# Show the baseline metrics
mb_mae, mb_rmse = evaluate_predictions(median_preds, true)
print('Median Baseline MAE: {:.4f}'.format(mb_mae))
print('Median Baseline RMSE: {:.4f}'.format(mb_rmse))

# Train several regression models on the training set and evaluate them on the test set
def evaluate(X_train, X_test, y_train, y_test):
    # Model names
    model_name_list = ['Linear Regression', 'ElasticNet Regression',
                       'Random Forest', 'Extra Trees', 'SVM',
                       'Gradient Boosted', 'Baseline']
    X_train = X_train.drop('G3', axis='columns')
    X_test = X_test.drop('G3', axis='columns')

    # Instantiate the models
    model1 = LinearRegression()
    model2 = ElasticNet(alpha=1.0, l1_ratio=0.5)
    model3 = RandomForestRegressor(n_estimators=100)
    model4 = ExtraTreesRegressor(n_estimators=100)
    model5 = SVR(kernel='rbf', degree=3, C=1.0, gamma='auto')
    model6 = GradientBoostingRegressor(n_estimators=50)

    # Results data frame
    results = pd.DataFrame(columns=['mae', 'rmse'], index=model_name_list)

    # Train and evaluate each model
    for i, model in enumerate([model1, model2, model3, model4, model5, model6]):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)

        # Error metrics
        mae = np.mean(abs(predictions - y_test))
        rmse = np.sqrt(np.mean((predictions - y_test) ** 2))

        # Insert the results into the data frame
        model_name = model_name_list[i]
        results.loc[model_name, :] = [mae, rmse]

    # Median baseline metrics
    baseline = np.median(y_train)
    baseline_mae = np.mean(abs(baseline - y_test))
    baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))

    results.loc['Baseline', :] = [baseline_mae, baseline_rmse]

    return results

results = evaluate(X_train, X_test, y_train, y_test)
print(results)

# Compare the models to find the best one
plt.figure(figsize=(12, 8))

# Mean absolute error
ax = plt.subplot(1, 2, 1)
results.sort_values('mae', ascending=True).plot.bar(y='mae', color='b', ax=ax, fontsize=20)
plt.title('Mean absolute error', fontsize=20)
plt.ylabel('MAE', fontsize=20)

# Root mean squared error
ax = plt.subplot(1, 2, 2)
results.sort_values('rmse', ascending=True).plot.bar(y='rmse', color='r', ax=ax, fontsize=20)
plt.title('Root mean squared error', fontsize=20)
plt.ylabel('RMSE', fontsize=20)
plt.tight_layout()
plt.show()

# Train and save a linear regression model (the target column G3 is dropped from the features before fitting)
model = LinearRegression()
model.fit(X_train.drop('G3', axis='columns'), y_train)
filename = 'LR_Model'
pickle.dump(model, open(filename, 'wb'))
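To confirm that the saved model file can be reused later, it can be reloaded and scored on the held-out test set. The following is a minimal sketch that reuses the X_test, y_test and evaluate_predictions objects defined above; the file name LR_Model matches the one used when saving:

# Reload the pickled model and check its test-set error
with open('LR_Model', 'rb') as f:
    loaded_model = pickle.load(f)

test_features = X_test.drop('G3', axis='columns')
loaded_preds = loaded_model.predict(test_features)
loaded_mae, loaded_rmse = evaluate_predictions(loaded_preds, y_test)
print('Reloaded model MAE: {:.4f}, RMSE: {:.4f}'.format(loaded_mae, loaded_rmse))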
4. Conclusions:
From the analysis above, the following preliminary conclusions can be drawn:
1. Parental education level affects children's grades: the higher the parents' education, the better the student's final grade tends to be.
2. A student's own desire to pursue higher education is associated with a better final grade.
3. Students who prepare more thoroughly tend to score higher (a sketch for checking this against the studytime column follows below).
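Conclusions 1 and 2 follow from the parental-education and higher_yes box plots in the process section. Conclusion 3 is not plotted above; a minimal sketch for checking it, assuming the studytime column of the same dataset (weekly study time coded from 1, lowest, to 4, highest), would be:

# Box plot of final grade against weekly study time
studytime_boxplot = sns.boxplot(x=student['studytime'], y=student['G3'])
studytime_boxplot.axes.set_title('Does more study time mean higher grades?', fontsize=30)
studytime_boxplot.set_xlabel('Weekly study time (1 = lowest, 4 = highest)', fontsize=20)
studytime_boxplot.set_ylabel('Final grade', fontsize=20)
plt.show()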
















