python多元回归分析 python做多元回归

转载
编程之翼 2023-08-29 20:54:51
文章标签 spark多元线性回归python 坐标轴权重方差 文章分类 Python 后端开发
这个函数又长又慢。。。。
# 导入包
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import copy
from scipy import stats # 求概率密度
from IPython.display import display # 使打印的表格对齐
import mpl_toolkits.axisartist as axisartist # 画箭头

class Regreession():
    '''
    df: 第一列为Y，其他列为X变量的DataFrame
    a: 置信系数
    standerd: 是否标准化数据
    weighting: 是否要加权估计，是请输入权重weighting，默认否False
    
    '''
    def __init__(self, df, weighting=False, a=0.05):
        self.df = df
        self.Y = df.iloc[:,0]
        X = df.iloc[:,1:]
        X.insert(0,'常数项',[1 for i in range(len(df))]) # X矩阵插入常数项
        self.X = X
        self.a = 0.05
        self.beta = Regreession.beta(self) # 调用下面的函数计算β
        self.y_hat = Regreession.y_hat(self)
        self.sigma2 = Regreession.sigma2_fun(self)
        self.e = self.Y-self.y_hat
        self.weighting = weighting # 权重，目前只应用于一元回归，以后再改
        self.DW = Regreession.DW_test(self)
        
    # 标准化数据
#     def standard_data(self):
#         Y = self.df.iloc[:,0]
#         Y = (Y-np.mean(Y))/np.sqrt(np.sum(np.square(Y-np.mean(Y))))
#         X = self.df.iloc[:,1:]
#         X = (X-np.mean(X))/np.sqrt(np.sum(np.square(X-np.mean(X))))
#         #X.insert(0,'常数项',[0 for i in range(len(self.df))]) # X矩阵插入常数项
#         return Y, X
    
    # 计算相关矩阵函数（增广矩阵）
    def r(self): 
        column = ['x%d'%i for i in range(1,len(self.df.columns))]
        column.insert(0,'y')
        df = pd.DataFrame(columns=column,index=column)
        # 默认第一列为y
        r = np.zeros(shape=(self.df.shape[1],self.df.shape[1]))  # 初始化简单相关系数矩阵
        for n1,i in enumerate(self.df):
            for n2,j in enumerate(self.df):
                a = np.sum((self.df[i]-np.mean(self.df[i]))*(self.df[j]-np.mean(self.df[j]))) # 分子
                b = np.sqrt(np.sum(np.square(self.df[i]-np.mean(self.df[i])))*np.sum(np.square(self.df[j]-np.mean(self.df[j])))) # 分母
                r[n1,n2] = a/b
                df.iloc[n1,n2] = round(a/b,3)
        
        r = np.around(r,3) # 保留3位小数
        print('简单相关系数矩阵：'); display(df)
        return r
        
    # 计算偏相关系数
    #def
    
    # 计算标准化和未标准化的β，返回β
    def beta(self):
        beta = np.dot(np.dot(np.linalg.inv(np.dot(np.transpose(self.X),self.X)),np.transpose(self.X)),self.Y) # (X'X)^(-1)=np.linalg.inv(np.dot(np.transpose(X),X))
        beta_ = np.around(beta,3)
#         if self.standard == True:
#             output_ = '标准化的回归方程：y = '
#             output2_ = [f' {beta_[i]}·x_{i}i + ' for i in range(0,len(beta_)-1)];
#             output3 = f' {beta_[len(beta)-1]}·x_{len(beta)-1}i'
#             print(output_,''.join(output2_),output3)
#         else:
        output = f'回归方程：y = {beta_[0]}'
        output2 = [f' + {beta_[i]}x_{i}' for i in range(1,len(beta_))]
        print(output,''.join(output2));print('')
        return beta
    
    # 预测y值，返回y预测值
    def y_hat(self,X=False):
        if X == False:                                            # 加入指定x1，x2，x3，求一个y时
            y_hat = np.dot(self.X,self.beta)
        else:
            y_hat = np.dot(X,self.beta)
        return y_hat
    
    # 计算可决系数R2  回归统计表
    def R2(self):
        n2 = self.Y.shape[0]-self.X.shape[1]; n3 = self.Y.shape[0]-1
        df = pd.DataFrame(columns=['负相关系数R','决定系数R^2','调整的R^2','回归标准误差的估计值Se','DW值'])
        R2 = np.sum(np.square(self.y_hat-np.mean(self.Y)))/np.sum(np.square(self.Y-np.mean(self.Y)))
        R2_ = 1- (np.sum(np.square(self.Y-self.y_hat))/n2) / (np.sum(np.square(self.Y-np.mean(self.Y)))/n3)
        DW = Regreession.DW_test(self)  # DW检验（自相关）
        df = df.append({'负相关系数R':round(np.sqrt(R2),3),'决定系数R^2':round(R2,3),'调整的R^2':round(R2_,3),
                        '回归标准误差的估计值Se':round(np.sqrt(self.sigma2),3),'DW值':round(DW,3)},ignore_index=True)
        df.index=['model']
        display(df)

    # F检验，方差分析表
    def F_test(self): 
        n1 = self.X.shape[1]-1; n2 = self.Y.shape[0]-self.X.shape[1]; n3 = self.Y.shape[0]-1
        ssr = np.sum(np.square(self.y_hat-np.mean(self.Y))); ssr = round(ssr,3)
        sse = np.sum(np.square(self.Y-self.y_hat)); sse = round(sse,3)
        sst = np.sum(np.square(self.Y-np.mean(self.Y))); sst = round(sst,3)
        SSR = ssr / n1
        SSE = sse / n2
        F = SSR/SSE
        p_value = 1 - stats.f.cdf(F,n1,n2); p_value = round(p_value,3) # 累计分布求分位数
        # 方差分析表
        df = pd.DataFrame(columns=['平方和','自由度','均方','F值','P值'],index=['SSR(回归)','SSE(残差)','SST(总计)'])
        df['自由度'] = [n1,n2,n3]
        df['平方和'] = [ssr,sse,sst]
        df['均方'] = [round(SSR,3),round(SSE,3),'']
        df['F值'] = [round(F,3),'','']
        df['P值'] = [p_value,'','']
        display(df)

    # 计算^σ2
    def sigma2_fun(self):
        e2 = np.sum(np.square(self.Y-self.y_hat))                  # 残差平方和e^2
        sigma2 = (1/(self.Y.shape[0]-self.X.shape[1]))*e2         # 方差预测^σ^2
        return sigma2
    
    # t检验,β置信区间
    def t_test(self):
        '''
        confidence_interval: 是否显示置信水平
        a: 置信水平
        '''
        df = pd.DataFrame(index=self.X.columns,columns=['系数','标准差','标准化系数','t检验_p值','置信区间']) # ,'零阶','偏相关系数','部分相关系数'
        df.index.name = '变量'
        df['系数'] = self.beta
        df['系数'] = df['系数'].apply(lambda x:round(x,3)) # 保留三位小数
        # 标准化回归系数
        Ljj = np.sqrt(np.sum(np.square(self.X-np.mean(self.X)))); Lyy = np.sqrt(np.sum(np.square(self.Y-np.mean(self.Y))))
        df['标准化系数'] = (Ljj/Lyy).values * self.beta
        df['标准化系数'] = df['标准化系数'].apply(lambda x: round(x,3))
        X_X = np.linalg.inv(np.dot(np.transpose(self.X),self.X)) # (X'X)^(-1)
        for n,j in enumerate(self.beta):
            cjj = X_X[n,n]
            beta_sigma = np.sqrt(self.sigma2*cjj)
            t_j = self.beta[n]/beta_sigma
            free_n = self.Y.shape[0]-self.X.shape[1]          # 自由度 n-p-1
            p_j = 1 - stats.t.cdf(t_j,free_n)
            df.iloc[n,3] = round(p_j*2,3)                     # t检验—p值
            df.iloc[n,1] = round(beta_sigma,3)             # 标准
            # 置信区间
            beta_lp = self.beta[n] + stats.t.ppf(self.a/2,free_n)*np.sqrt(self.sigma2*cjj)
            beta_ub = self.beta[n] - stats.t.ppf(self.a/2,free_n)*np.sqrt(self.sigma2*cjj)
            df.iloc[n,4] = '[%.3f，%.3f]'%(beta_lp,beta_ub)
            df.loc['常数项','标准化系数'] = ''
        display(df)
    
    # 预测y置信区间
    def y_hat_interval(self,X):
        X.insert(0,1)                         # 常数项
        y_hat = Regreession.y_hat(self,X)
        y_hat_lb = y_hat - 2*np.sqrt(self.sigma2); y_hat_lb = round(y_hat_lb,3)
        y_hat_ub = y_hat + 2*np.sqrt(self.sigma2); y_hat_ub = round(y_hat_ub,3)
        print('y的预测值为%.4f'%y_hat)
        print(f'y的95%预测区间为 [ {y_hat_lb}, {y_hat_ub} ]')
    
    # 残差散点图——检验异方差
    def e_picture(self):
        fig, ax = plt.subplots()
        ax.spines['right'].set_visible(False) # 去掉顶边框
        ax.spines['top'].set_visible(False)   # 去掉右边框
        markers = ['.','*','v','d','o']                      # 注，这里只加了几个而已
        for n,i in enumerate(self.X.drop(columns='常数项')):
            plt.scatter(self.X[i].values,self.e.values, label=i, marker=markers[n])
        plt.xlabel('自变量x',fontsize=13); plt.ylabel(r'残差$e_i$',fontsize=13); # plt.title('残差散点图')
        if n != 0:
            plt.legend()
        plt.axhline(y=0,c="black", lw=0.5)#添加水平直线
        plt.tick_params(labelsize=13) #刻度字体大小13
        
    # 等级相关系数法 —— 异方差检验
    def rs_test(self):
        '''
        weighting：是否用加权值进行计算？ 是weighting=True
        注：等级相关系数法好像只用于一元回归？
        一元回归时，这里的自变量默认命名为x
        '''
        dic_x = self.X.x.sort_values().reset_index().reset_index().set_index('x')['level_0'].to_dict()
        level_x = self.X.x.apply(lambda x: dic_x[x]).values      # x等级
        if type(self.weighting)==float or type(self.weighting)==int :  # 如果进行权重估计，用e_iw计算 
            self.e_iw = Regreession.e_iw_fun(self)
            dic_e = abs(self.e_iw.rename('e_iw')).sort_values().reset_index().reset_index().set_index('e_iw')['level_0'].to_dict()   # 因为前面一些series没有命名的原因，继承了y的名字
            level_e = abs(self.e_iw).apply(lambda x: dic_e[x]).values   # e等级， 注意：是e的绝对值！！！abs(e)
            a = '加权后的'
        else:
            dic_e = abs(self.e).sort_values().reset_index().reset_index().set_index('y')['level_0'].to_dict()   # 因为前面一些series没有命名的原因，继承了y的名字
            level_e = abs(self.e).apply(lambda x: dic_e[x]).values   # e等级， 注意：是e的绝对值！！！abs(e)
            a = ''
        n = len(df49)
        di = level_x - level_e
        r_s = 1 - (6/(n*(np.square(n)-1)))*np.sum(np.square(di)) # 次幂不能用^,要用np.square()!!!
        t = np.sqrt(n-2)*r_s/np.sqrt(1-np.square(r_s))           # t统计量
        p = 1 - stats.t.cdf(abs(t),n-2)                               # p值
        
        print(a,"斯皮尔曼检验：等级相关系数 r_s = %.4f , t = %.4f ， p值 = %.4f"%(r_s,t,p))
    
    # 加权残差，返回e_iw
    def e_iw_fun(self):
        wi = 1/self.X.x**(self.weighting)
        e_iw = np.sqrt(wi)*self.e
        return e_iw      # 返回e_iw
    
    # 残差图——检验自相关
    def e_picture_auto(self):
        fig = plt.figure(figsize=(8, 8))                   #创建画布
        ax = axisartist.Subplot(fig, 111)                  #使用axisartist.Subplot方法创建一个绘图区对象ax
        fig.add_axes(ax)                                   #将绘图区对象添加到画布中
        ax.axis[:].set_visible(False)                      #通过set_visible方法设置绘图区所有坐标轴隐藏
        ax.axis["x"] = ax.new_floating_axis(0,0)           #ax.new_floating_axis代表添加新的坐标轴
        ax.axis["x"].set_axisline_style("->", size = 1.0)  #给x坐标轴加上箭头
        ax.axis["y"] = ax.new_floating_axis(1,0)           
        ax.axis["y"].set_axisline_style("-|>", size = 1.0) #添加y坐标轴，且加上箭头
        ax.axis["x"].set_axis_direction("top")             #设置x、y轴上刻度显示方向
        ax.axis["y"].set_axis_direction("right")
        # 画画
        plt.scatter(self.e.values[:-1],self.e.values[1:])
        plt.annotate(r"$e_t$", (-0.02,0.2), xycoords='data',xytext=(-0.001,0.8),fontsize=15) # ,arrowprops=dict(arrowstyle='-') # y标签
        plt.annotate(r"$e_{t-1}$", (0.2,-0.02), xycoords='data',xytext=(0.8,-0.05),fontsize=15) # x标签
        plt.annotate(r'$O$',(0,0),xycoords='data',xytext=(-0.075,-0.075),fontsize=15)
#         plt.ylim([-0.34,0.34]); plt.xlim([-0.34,0.34])
        plt.xticks([-0.8,-0.4,0,0.4,0.8],['-0.8','-0.4','','0.4','0.8']) # x标签
        plt.yticks([-0.8,-0.4,0,0.4,0.8],['-0.8','-0.4','','0.4','0.8']) # y标签
        
    # DW检验自相关
    def DW_test(self): # n=20,k=2时的DW临界值
        DW = np.sum(np.square(self.e.values[1:]-self.e.values[:-1]))/np.sum(np.square(self.e.values[1:]))
#         print('DW值为%.3f'%DW)
        return DW
    
    # 迭代法补救自相关性
    def iteration(self):
        rho = np.sum(self.e.values[1:]*self.e.values[:-1])/np.sum(np.square(self.e.values[1:])) # 误差项一阶自相关系数 \rho
        print('rho=',rho)
        df = self.df.iloc[:-1,:]*rho - self.df.iloc[1:,:].values
        print('一次迭代法：')
        rs = Regreession(df)
        print('DW值为%.3f'%rs.DW_test())
        rs.summary()
        return df
    
    # 差分法补救自相关性
    def ChaFen(self):
        df = self.df.iloc[1:,:].values - self.df.iloc[:-1,:]  # 注意这里用DataFrame/Series运算的时候一定要用.values!!!!，不然他会按照索引对齐运算，就全都等于0了。。。
        print('一次差分法：\n')
        rs = Regreession(df)
        rs.DW_test(); print()
        rs.summary()
    
    # 总结果
    def summary(self):
        Regreession.r(self)      # 简单相关系数矩阵
        print('')
        Regreession.R2(self)          # 可决系数R^2
        Regreession.F_test(self) # F检验
        Regreession.t_test(self) # t检验
        # Regreession.e_picture(self) # 残差图
python多元回归分析 python做多元回归_spark多元线性回归python