1、为什么要进行特征选择

目前我的理解是,能够简化模型,起到降维的作用。

2、如何进行特征选择

其实很简单,将数据输入到模型里面,然后通过sklearn提供的feature_selection模块中的SelectFromModel选择有效特征。

2.1 SVM,Lasso,LR

from sklearn.svm import LinearSVC
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression as LR,Lasso,LassoCV
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

# Data: Attachment 1 of the 2021 Huashu Cup (华数杯) contest.
df = pd.read_excel('2.xlsx')
# Column B7 contains missing values; drop it outright as a quick workaround.
df = df.drop(columns='B7')

# Features: columns at positions 2 through the second-to-last.
x = df.iloc[:, 2:-1]
feature_names = x.columns

# Target: the last column only.
Y = df.iloc[:, -1]
X = MinMaxScaler().fit_transform(x)


def _select_and_report(estimator, features, target, names, max_features=8):
    """Fit a SelectFromModel around ``estimator`` and print what it kept.

    Prints the fitted selector, its boolean support mask, and the entries of
    ``names`` picked out by that mask.

    Args:
        estimator: Unfitted scikit-learn estimator handed to SelectFromModel.
        features: Feature matrix passed to ``fit``.
        target: Target values passed to ``fit``.
        names: Index of feature names, aligned with the columns of
            ``features``.
        max_features: Upper bound on the number of selected features.

    Returns:
        The fitted SelectFromModel instance.
    """
    selector = SelectFromModel(estimator, max_features=max_features)
    selector = selector.fit(features, target)
    mask = selector.get_support()
    print(selector)
    print(mask)
    print(names[mask])
    return selector


# SVM: linear SVC with an L1 penalty.
lsvc = LinearSVC(C=3, penalty='l1', dual=False)
sfm = _select_and_report(lsvc, X, Y, feature_names)

# Lasso regression model.
clf = LassoCV()
sfm = _select_and_report(clf, X, Y, feature_names)

# Logistic regression model. The instance gets its own name so the imported
# class alias ``LR`` is not overwritten and this code can be re-run.
lr_model = LR(C=7)
sfm = _select_and_report(lr_model, X, Y, feature_names)

2.2 随机森林

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

# Apply the seaborn theme first, then layer the matplotlib overrides on top.
sns.set(palette='deep', style='darkgrid')
plt.rcParams.update({
    # Presumably chosen so Chinese labels render in figures; confirm.
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
    # DPI used when saving figures.
    'savefig.dpi': 300,
})
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
df = pd.read_excel('2.xlsx')
# NOTE(review): unlike the earlier snippet, column B7 (said there to contain
# missing values) is not dropped here; confirm the estimator tolerates it.

# Split the rows by the value of the brand-type column.
df1 = df[df['品牌类型'] == 1]
df2 = df[df['品牌类型'] == 2]
df3 = df[df['品牌类型'] == 3]

# Only the brand-type-3 subset is modelled below.
X = df3.iloc[:, 2:-1]
y = df3.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Column labels aligned with the feature columns selected above.
feat_labels = df.columns[2:-1]
forest = RandomForestClassifier(
    n_estimators=100, random_state=0, n_jobs=-1
).fit(X_train, y_train)

# Feature importances, ranked from most to least important.
I = []
importance = forest.feature_importances_
# Negate so that argsort yields indices in descending order of importance.
indices = np.argsort(-importance)
print(indices)

for rank, idx in enumerate(indices, start=1):
    label, score = feat_labels[idx], importance[idx]
    print('(%d) %s %f' % (rank, label, score))
    I.append((label, score))

# Collect the (feature name, importance) pairs into a DataFrame.
I = pd.DataFrame(I)