1. Inspect the structure of each column

import numpy as np

def print_col_info(dataset):
    '''Print info for every column in dataset.
    Detailed info includes:
    1. the distinct values
    2. the number of distinct values'''
    col_num = dataset.shape[1]
    for i in range(col_num):
        print('\ncol-{} info: '.format(i))
        temp = np.sort(list(set(dataset[:, i])))
        print('values: {}'.format(temp))
        print('values num: {}'.format(temp.shape[0]))
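A quick usage sketch (the toy array below is made up purely for illustration):

toy = np.array([['a', 1], ['b', 2], ['a', 1]])  # hypothetical toy data
print_col_info(toy)  # prints the distinct values and their count for each column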

2. Strip the whitespace from the strings in the discrete columns with map

# List of the column indices that hold discrete string values
str_cols = [1, 3, 5, 6, 7, 8, 9, 13, 14]
for col in str_cols:
    df.iloc[:, col] = df.iloc[:, col].map(lambda x: x.strip())

Usage of map: it is a higher-order function, i.e. one that takes a function as an argument (or returns a function as its result). In Python 3 it returns a lazy iterator, so wrap it in list() to materialize the results:

def square(x): return x * x
xx = map(square, range(10))
xx = list(xx)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

3. Check whether the data contains missing values

# Check for missing values: any() flags columns with at least one NaN,
# all() flags columns that are entirely NaN
df.isnull().any()
df.isnull().all()
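When a per-column count is more useful than a boolean flag, isnull().sum() gives it; a minimal sketch on a made-up DataFrame:

import numpy as np
import pandas as pd

demo = pd.DataFrame({'a': [1, np.nan, 3], 'b': [4, 5, 6]})  # made-up data
print(demo.isnull().any())  # a: True, b: False
print(demo.isnull().sum())  # per-column count of missing values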

4. Replace a fixed placeholder value with NaN, then drop the NaN rows

# In-place, no return value: replace the "?" placeholder string
# with the NaN missing-value marker
df.replace("?", np.nan, inplace=True)
# Here we simply drop the samples that contain missing values
df.dropna(inplace=True)

Replacing the 0 placeholders in a column with None:

# First check whether there are any "true" missing values
pima['serum_insulin'].isnull().sum()
# Manually replace 0 with None
pima['serum_insulin'] = pima['serum_insulin'].map(lambda x: x if x != 0 else None)
# Check the number of missing values again
pima['serum_insulin'].isnull().sum()
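For reference, pandas can do the same replacement in one vectorized call (a numeric column stores the missing marker as NaN either way):

# Vectorized equivalent: replace every 0 in the column with NaN in one call
pima['serum_insulin'] = pima['serum_insulin'].replace(0, np.nan)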

5. Encode the discrete (categorical) data

from sklearn import preprocessing

label_encoder = []  # holds the encoder for each column (None for numeric columns)
encoded_set = np.empty(df.shape)
for col in range(df.shape[1]):
    encoder = None
    if df.iloc[:, col].dtype == object:  # string column
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, col] = encoder.fit_transform(df.iloc[:, col])
    else:  # numeric column
        encoded_set[:, col] = df.iloc[:, col]
    label_encoder.append(encoder)
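Because every fitted encoder is kept in label_encoder, the integer codes can later be mapped back to the original strings with inverse_transform; a sketch, where the column index is a made-up example:

col = 1  # hypothetical index of one of the string columns
if label_encoder[col] is not None:
    # encoded_set is float, so cast back to int before decoding
    original_labels = label_encoder[col].inverse_transform(encoded_set[:, col].astype(int))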

6. Use describe to check the min and max of the continuous columns, and rescale the columns whose ranges differ widely

df.describe()

cols = [2, 10, 11]
data_scalers = []  # holds the fitted scaler for each rescaled column
for col in cols:
    data_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
    encoded_set[:, col] = np.ravel(data_scaler.fit_transform(encoded_set[:, col].reshape(-1, 1)))
    data_scalers.append(data_scaler)
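The stored scalers matter at prediction time: held-out data must be transformed with the scalers fitted on the training data, not re-fitted. A sketch, assuming a hypothetical test_set array with the same column layout:

# Apply the already-fitted scalers to new data (transform, not fit_transform)
for scaler, col in zip(data_scalers, cols):
    test_set[:, col] = np.ravel(scaler.transform(test_set[:, col].reshape(-1, 1)))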

7. Compute accuracy, precision, recall, and the F1 score for a given model

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

def score_cal(model, test_X, test_y):
    num_validations = 5
    accuracy = cross_val_score(model, test_X, test_y, scoring='accuracy', cv=num_validations)
    print('Accuracy: {:.2f}%'.format(accuracy.mean() * 100))
    precision = cross_val_score(model, test_X, test_y, scoring='precision_weighted', cv=num_validations)
    print('Precision: {:.2f}%'.format(precision.mean() * 100))
    recall = cross_val_score(model, test_X, test_y, scoring='recall_weighted', cv=num_validations)
    print('Recall: {:.2f}%'.format(recall.mean() * 100))
    f1 = cross_val_score(model, test_X, test_y, scoring='f1_weighted', cv=num_validations)
    print('F1 score: {:.2f}%'.format(f1.mean() * 100))
    # Print a performance report
    y_pred = model.predict(test_X)
    confusion_mat = confusion_matrix(test_y, y_pred)
    print(confusion_mat)  # inspect the confusion matrix
    # Let sklearn print precision, recall and F1 directly
    target_names = ['<=50K', '>50K']
    print(classification_report(test_y, y_pred, target_names=target_names))
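A usage sketch with an arbitrary classifier (the model choice here is purely illustrative, and the train/test splits are assumed to come from a split like the one in section 8 below):

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB().fit(X_train, y_train)  # hypothetical fitted model
score_cal(clf, X_test, y_test)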

8. Randomly split the data into training and test sets with sklearn

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 666)
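When the classes are imbalanced, train_test_split's stratify argument keeps the class proportions identical in both splits:

# Keep the class ratio of y identical in the train and test splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=666, stratify=y)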

9. sklearn's grid search (Grid Search)

## Grid search for kNN

param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
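Feeding this grid to GridSearchCV together with a kNN classifier searches both sub-grids; a minimal sketch, assuming the train split from section 8:

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)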

10. Define a function that searches all the given parameters and optimizes the machine-learning pipeline by a metric

# Import the grid-search module
from sklearn.model_selection import GridSearchCV

def get_best_model_accuracy(model, params, X, y):
    grid = GridSearchCV(model,           # the model to search over
                        params,          # the parameters to try
                        error_score=0.)  # score a failed fit as 0 instead of raising
    grid.fit(X, y)  # fit the model across the parameter grid
    # The classic performance metric
    print("Best Accuracy: {}".format(grid.best_score_))
    # The parameters that achieved the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # Average time to fit (seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # Average time to score (seconds);
    # a good indicator of how the model will perform in the real world
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))

11. Iterate over a DataFrame to transform the value of every row

def Ticket_First_Let(x):
    return x[0]

X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)
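For this particular transformation, pandas' string accessor is a one-line equivalent:

# Vectorized form: take the first character of each ticket string
X_train['Ticket_First_Letter'] = X_train['Ticket'].str[0]
X_test['Ticket_First_Letter'] = X_test['Ticket'].str[0]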