python Fisher score 特征选择 fisher判别python

转载

mob64ca1403c772 2024-08-10 07:36:02

文章标签 python 机器学习开发语言判别模型频域 文章分类 Python 后端开发

文章目录

1. 概述
2. 相关原理

2.1 Fish_score
2.2 线性判别模型和二次判别模型

3. 代码实现过程
4. Tips

1. 概述

本章节通过分析轴承振动数据得到了系列信号特征，采用fisher_score方法，获取了两个敏感特征，最后采用线性判别模型及二次判别模型获取了故障尺寸的分类模型。

2. 相关原理

2.1 Fish_score

详见Pyhon在振动信号处理中的高级应用（十）：监督式特征的选择方法（Fisher Score、ReliefF）

2.2 线性判别模型和二次判别模型

可参考线性和二次判别分析、LDA（线性判别分析）。

3. 代码实现过程

第一步：在相同转速下，分析不同内圈故障尺寸下原始信号时域和频域特征，以及包络信号频域特征，特征的计算方法

from scipy.io import loadmat
import numpy as np
from scipy import signal, fftpack, stats
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn import discriminant_analysis
from sklearn.metrics import accuracy_score

plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.size'] = '12'
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams["axes.unicode_minus"] = False

# 从.mat文件中解析数据
def get_data(source):
    data_DE = []
    data_FE = []
    data_BA = []
    data_Speed = []
    
    
    for i, key in enumerate(source.keys()):
        if i == 3:
            data_DE = source[key].flatten()
        elif i == 4:
            data_FE = source[key].flatten()
        elif i == 5:
            if len(source.keys()) == 6:
                data_Speed = source[key].flatten()
            else:
                data_BA = source[key].flatten()
        elif i == 6:
            data_Speed = source[key].flatten()
                
    return data_DE, data_FE, data_BA, data_Speed

def envelope(sig, fs, low_pass):
    # 信号解包络
    sig_envelope = sig ** 2
    # 求取包络信号的FFT变换结果
    sig_fft = fftpack.fft(sig_envelope)
    # 求取包络信号的FFT变换频率
    sig_fre = fftpack.fftfreq(len(sig_fft), d=1/fs)
    # 低通滤波器
    sig_fft[abs(sig_fre)>low_pass] = 0
    # 转换为滤波后时域信号
    sig_filter = fftpack.ifft(sig_fft).real
    # 求取包络曲线的FFT结果
    fre_sig, mag_sig = signal.welch(sig_filter, fs=fs, nfft=len(sig_filter), 
                                    nperseg=len(sig_filter), scaling = 'spectrum')
    mag_sig = mag_sig[fre_sig<=low_pass]
    fre_sig = fre_sig[fre_sig<=low_pass]
    mag_sig = np.sqrt(2*mag_sig)
    return fre_sig, mag_sig

def cal_feature(source_path, source_list, types, speeds, aeras, fs, resample=False):
    feature_list = []
    type_list = []
    speed_list = []
    aera_list = []
    for _type, _speed, _aera, _list in zip(types, speeds, aeras, source_list):

        # 获取待处理数据
        source = loadmat(source_path + str(_list) + '.mat')
        # 解析数据
        data_DE, data_FE, data_BA, data_Speed = get_data(source)
        # 重采样
        if resample:
            data_DE = data_DE[::4]
        # 定义切片长度
        split_N = fs
        # 获取信号特征
        for i in range(len(data_DE)//split_N):
            # 切分数据
            data = data_DE[i*split_N:(i+1)*split_N]
            # 原始信号特征提取
            # 获取数据时域特征
            feature_time1 = get_time_features(data)
            # 获取数据频域特征
            fre_sig, mag_sig = signal.welch(data, fs=fs, nfft=split_N, 
                                            nperseg=split_N, scaling='spectrum')
            mag_sig = np.sqrt(2*mag_sig)
            feature_fre1 = get_frequence_features(fre_sig, mag_sig)
            # 获取包络谱频域特征
            fre_sig, mag_sig = envelope(data, fs, low_pass=200)
            feature_fre2 = get_frequence_features(fre_sig, mag_sig)
            # 特征组合
            feature = np.concatenate((feature_time1, feature_fre1, feature_fre2), axis=0)
            feature_list.append(list(feature))
            type_list.append(_type)
            speed_list.append(_speed)
            aera_list.append(_aera)
    # 构建特征Dataframe
    columns = []
    for i in range(len(feature_time1)):
        columns.append('feature_time1-'+str(i+1))
    for i in range(len(feature_fre1)):
        columns.append('feature_fre1-'+str(i+12))
    for i in range(len(feature_fre2)):
        columns.append('feature_fre2-'+str(i+12))
    df = pd.DataFrame(feature_list, columns=columns)
    df['types'] = type_list
    df['speed'] = speed_list
    df['aera'] = aera_list
    return df

def get_fisher_score(df, types='types'):
    # 计算各特征平均值
    feature_mean = df.mean()[:-1]
    # 计算各类数据中特征平均值
    df_feature_mean = df.groupby(types).mean()
    # 类别列表
    types_list = df_feature_mean.index
    # 各类别数量
    num = [len(df[df[types]==_types_list]) for _types_list in types_list]
    # 总样本数
    N = df.shape[0]
    # 特征列表
    features_list = df_feature_mean.columns
    # 计算类间方差
    S_B = 0
    for i,_num in enumerate(num):
        S_B += _num / N * (feature_mean - df_feature_mean.iloc[i]) ** 2
    # 计算类内方差
    S_W = 0
    for i, _types_list in enumerate(types_list):
        feature_var = (df[df['types']==_types_list].iloc[:,:-1] - df_feature_mean.iloc[i])**2
        S_W += feature_var.sum() / N
    # 计算fisherscore
    Fisher_Score = S_B / S_W
    return Fisher_Score

# 获取正常数据特征
source_path = './/data/Normal Baseline Data/'
source_list = [97, 98, 99, 100]
types = [0, 0, 0, 0]

speeds = [1797, 1772, 1750, 1730]
aeras = ['Normal', 'Normal', 'Normal', 'Normal']
fs = 12000

df_normal = cal_feature(source_path, source_list, types, speeds, aeras, fs, resample=True)

# 获取电机端内圈故障信号特征
source_path = './/data/12k Drive End Bearing Fault Data/'
source_list = [105, 106, 107, 108, 169, 170, 171, 172, 
               209, 210, 211, 212, 3001, 3002, 3003, 3004,
               118, 119, 120, 121, 185, 186, 187, 188,
               222, 223, 224, 225, 3005, 3006, 3007, 3008,
               130, 131, 132, 133, 197, 198, 199, 200,
               234, 235, 236, 237]

types = [0.18, 0.18, 0.18, 0.18, 0.36, 0.36, 0.36, 0.36,
         0.53, 0.53, 0.53, 0.53, 0.71, 0.71, 0.71, 0.71,
         0.18, 0.18, 0.18, 0.18, 0.36, 0.36, 0.36, 0.36,
         0.53, 0.53, 0.53, 0.53, 0.71, 0.71, 0.71, 0.71,
         0.18, 0.18, 0.18, 0.18, 0.36, 0.36, 0.36, 0.36,
         0.53, 0.53, 0.53, 0.53]

speeds = [1797, 1772, 1750, 1730, 1797, 1772, 1750, 1730,
          1797, 1772, 1750, 1730, 1797, 1772, 1750, 1730,
          1797, 1772, 1750, 1730, 1797, 1772, 1750, 1730,
          1797, 1772, 1750, 1730, 1797, 1772, 1750, 1730,
          1797, 1772, 1750, 1730, 1797, 1772, 1750, 1730,
          1797, 1772, 1750, 1730]

aeras = ['IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR',
         'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR', 'IR',
         'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball',
         'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball', 'Ball',
         'OR', 'OR', 'OR', 'OR', 'OR', 'OR', 'OR', 'OR',
         'OR', 'OR', 'OR', 'OR']

fs = 12000

df_break = cal_feature(source_path, source_list, types, speeds, aeras, fs, resample=False)

# 整合数据结果
df = pd.concat([df_normal, df_break], ignore_index=True)

第二步：采用fisher_score方法，获取了两个敏感特征，最后采用线性判别模型及二次判别模型获取了故障尺寸的分类模型。

# # 提取当前故障尺寸下数据
data = df[(df['aera']=='IR') | (df['aera']=='Normal')].iloc[:, :-2]
# 对各特征进行归一化处理
scaler = MinMaxScaler()
values = scaler.fit_transform(data.values[:, :-1])
data.values[:, :-1] = values
# 获取特征名称
types = data.groupby('types').mean().index.values
# 获取排名前八的特征参数
cof_corr = get_fisher_score(data, types='types').sort_values()[-10:-2]
print(cof_corr)
# 截取分类效果较好的几个特征用于分析
index = list(cof_corr.index.values)
index.append('types')
data = data[index]
plt.figure(figsize=(5, 5))
# 展示不同故障尺寸下的特征分布情况
for _type in types:
    plt.scatter(data[data['types'] == _type][index[-4]], data[data['types'] == _type][index[-5]])
plt.xlabel(index[-4])
plt.ylabel(index[-5])
plt.legend(types)
plt.show()

# 构建训练集
x = data[[index[-4], index[-5]]].values
y = data['types']
# 修改类型名称（适应sklearn模型）
y[y==0.18] = 1
y[y==0.36] = 2
y[y==0.53] = 3
y[y==0.71] = 4

# 对模型采用线性判别分析进行分类
lin_dis = discriminant_analysis.LinearDiscriminantAnalysis()
lin_dis.fit(x, y)
# 构建判别结果
x1 = np.arange(0, 1.001, 0.001)
x2 = np.arange(0, 1.001, 0.001)
xx1, xx2 = np.meshgrid(x1, x2)
y_predict = []
for _xx1, _xx2 in zip(xx1, xx2):
    _y_predict = lin_dis.predict(np.array([list(_xx1), list(_xx2)]).T.reshape(-1,2))
    y_predict.append(list(_y_predict))
y_predict = np.array(y_predict)
# 展示判别结果
plt.figure(figsize=(5,5))
plt.pcolormesh(x1, x2, y_predict, cmap=plt.cm.Accent, alpha=0.5)
for _type in [0,1,2,3,4]:
    plt.scatter(x[y==_type][:,0], x[y==_type][:,1], marker='*')
plt.show()

# 对模型采用二次判别分析进行分类
qua_dis = discriminant_analysis.QuadraticDiscriminantAnalysis()
qua_dis.fit(x, y)
# 构建判别结果
x1 = np.arange(0, 1.001, 0.001)
x2 = np.arange(0, 1.001, 0.001)
xx1, xx2 = np.meshgrid(x1, x2)
y_predict = []
for _xx1, _xx2 in zip(xx1, xx2):
    _y_predict = qua_dis.predict(np.array([list(_xx1), list(_xx2)]).T.reshape(-1,2))
    y_predict.append(list(_y_predict))
y_predict = np.array(y_predict)
# 展示判别结果
plt.figure(figsize=(5,5))
plt.pcolormesh(x1, x2, y_predict, cmap=plt.cm.Accent, alpha=0.5)
for _type in [0,1,2,3,4]:
    plt.scatter(x[y==_type][:,0], x[y==_type][:,1], marker='*')
plt.show()

python Fisher score 特征选择 fisher判别python_开发语言