这个函数又长又慢。。。。
# 导入包
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import copy
from scipy import stats # 求概率密度
from IPython.display import display # 使打印的表格对齐
import mpl_toolkits.axisartist as axisartist # 画箭头
class Regreession():
'''
df: 第一列为Y,其他列为X变量的DataFrame
a: 置信系数
standerd: 是否标准化数据
weighting: 是否要加权估计,是请输入权重weighting,默认否False
'''
def __init__(self, df, weighting=False, a=0.05):
self.df = df
self.Y = df.iloc[:,0]
X = df.iloc[:,1:]
X.insert(0,'常数项',[1 for i in range(len(df))]) # X矩阵插入常数项
self.X = X
self.a = 0.05
self.beta = Regreession.beta(self) # 调用下面的函数计算β
self.y_hat = Regreession.y_hat(self)
self.sigma2 = Regreession.sigma2_fun(self)
self.e = self.Y-self.y_hat
self.weighting = weighting # 权重,目前只应用于一元回归,以后再改
self.DW = Regreession.DW_test(self)
# 标准化数据
# def standard_data(self):
# Y = self.df.iloc[:,0]
# Y = (Y-np.mean(Y))/np.sqrt(np.sum(np.square(Y-np.mean(Y))))
# X = self.df.iloc[:,1:]
# X = (X-np.mean(X))/np.sqrt(np.sum(np.square(X-np.mean(X))))
# #X.insert(0,'常数项',[0 for i in range(len(self.df))]) # X矩阵插入常数项
# return Y, X
# 计算相关矩阵函数(增广矩阵)
def r(self):
column = ['x%d'%i for i in range(1,len(self.df.columns))]
column.insert(0,'y')
df = pd.DataFrame(columns=column,index=column)
# 默认第一列为y
r = np.zeros(shape=(self.df.shape[1],self.df.shape[1])) # 初始化简单相关系数矩阵
for n1,i in enumerate(self.df):
for n2,j in enumerate(self.df):
a = np.sum((self.df[i]-np.mean(self.df[i]))*(self.df[j]-np.mean(self.df[j]))) # 分子
b = np.sqrt(np.sum(np.square(self.df[i]-np.mean(self.df[i])))*np.sum(np.square(self.df[j]-np.mean(self.df[j])))) # 分母
r[n1,n2] = a/b
df.iloc[n1,n2] = round(a/b,3)
r = np.around(r,3) # 保留3位小数
print('简单相关系数矩阵:'); display(df)
return r
# 计算偏相关系数
#def
# 计算标准化和未标准化的β,返回β
def beta(self):
beta = np.dot(np.dot(np.linalg.inv(np.dot(np.transpose(self.X),self.X)),np.transpose(self.X)),self.Y) # (X'X)^(-1)=np.linalg.inv(np.dot(np.transpose(X),X))
beta_ = np.around(beta,3)
# if self.standard == True:
# output_ = '标准化的回归方程:y = '
# output2_ = [f' {beta_[i]}·x_{i}i + ' for i in range(0,len(beta_)-1)];
# output3 = f' {beta_[len(beta)-1]}·x_{len(beta)-1}i'
# print(output_,''.join(output2_),output3)
# else:
output = f'回归方程:y = {beta_[0]}'
output2 = [f' + {beta_[i]}x_{i}' for i in range(1,len(beta_))]
print(output,''.join(output2));print('')
return beta
# 预测y值,返回y预测值
def y_hat(self,X=False):
if X == False: # 加入指定x1,x2,x3,求一个y时
y_hat = np.dot(self.X,self.beta)
else:
y_hat = np.dot(X,self.beta)
return y_hat
# 计算可决系数R2 回归统计表
def R2(self):
n2 = self.Y.shape[0]-self.X.shape[1]; n3 = self.Y.shape[0]-1
df = pd.DataFrame(columns=['负相关系数R','决定系数R^2','调整的R^2','回归标准误差的估计值Se','DW值'])
R2 = np.sum(np.square(self.y_hat-np.mean(self.Y)))/np.sum(np.square(self.Y-np.mean(self.Y)))
R2_ = 1- (np.sum(np.square(self.Y-self.y_hat))/n2) / (np.sum(np.square(self.Y-np.mean(self.Y)))/n3)
DW = Regreession.DW_test(self) # DW检验(自相关)
df = df.append({'负相关系数R':round(np.sqrt(R2),3),'决定系数R^2':round(R2,3),'调整的R^2':round(R2_,3),
'回归标准误差的估计值Se':round(np.sqrt(self.sigma2),3),'DW值':round(DW,3)},ignore_index=True)
df.index=['model']
display(df)
# F检验,方差分析表
def F_test(self):
n1 = self.X.shape[1]-1; n2 = self.Y.shape[0]-self.X.shape[1]; n3 = self.Y.shape[0]-1
ssr = np.sum(np.square(self.y_hat-np.mean(self.Y))); ssr = round(ssr,3)
sse = np.sum(np.square(self.Y-self.y_hat)); sse = round(sse,3)
sst = np.sum(np.square(self.Y-np.mean(self.Y))); sst = round(sst,3)
SSR = ssr / n1
SSE = sse / n2
F = SSR/SSE
p_value = 1 - stats.f.cdf(F,n1,n2); p_value = round(p_value,3) # 累计分布求分位数
# 方差分析表
df = pd.DataFrame(columns=['平方和','自由度','均方','F值','P值'],index=['SSR(回归)','SSE(残差)','SST(总计)'])
df['自由度'] = [n1,n2,n3]
df['平方和'] = [ssr,sse,sst]
df['均方'] = [round(SSR,3),round(SSE,3),'']
df['F值'] = [round(F,3),'','']
df['P值'] = [p_value,'','']
display(df)
# 计算^σ2
def sigma2_fun(self):
e2 = np.sum(np.square(self.Y-self.y_hat)) # 残差平方和e^2
sigma2 = (1/(self.Y.shape[0]-self.X.shape[1]))*e2 # 方差预测^σ^2
return sigma2
# t检验,β置信区间
def t_test(self):
'''
confidence_interval: 是否显示置信水平
a: 置信水平
'''
df = pd.DataFrame(index=self.X.columns,columns=['系数','标准差','标准化系数','t检验_p值','置信区间']) # ,'零阶','偏相关系数','部分相关系数'
df.index.name = '变量'
df['系数'] = self.beta
df['系数'] = df['系数'].apply(lambda x:round(x,3)) # 保留三位小数
# 标准化回归系数
Ljj = np.sqrt(np.sum(np.square(self.X-np.mean(self.X)))); Lyy = np.sqrt(np.sum(np.square(self.Y-np.mean(self.Y))))
df['标准化系数'] = (Ljj/Lyy).values * self.beta
df['标准化系数'] = df['标准化系数'].apply(lambda x: round(x,3))
X_X = np.linalg.inv(np.dot(np.transpose(self.X),self.X)) # (X'X)^(-1)
for n,j in enumerate(self.beta):
cjj = X_X[n,n]
beta_sigma = np.sqrt(self.sigma2*cjj)
t_j = self.beta[n]/beta_sigma
free_n = self.Y.shape[0]-self.X.shape[1] # 自由度 n-p-1
p_j = 1 - stats.t.cdf(t_j,free_n)
df.iloc[n,3] = round(p_j*2,3) # t检验—p值
df.iloc[n,1] = round(beta_sigma,3) # 标准
# 置信区间
beta_lp = self.beta[n] + stats.t.ppf(self.a/2,free_n)*np.sqrt(self.sigma2*cjj)
beta_ub = self.beta[n] - stats.t.ppf(self.a/2,free_n)*np.sqrt(self.sigma2*cjj)
df.iloc[n,4] = '[%.3f,%.3f]'%(beta_lp,beta_ub)
df.loc['常数项','标准化系数'] = ''
display(df)
# 预测y置信区间
def y_hat_interval(self,X):
X.insert(0,1) # 常数项
y_hat = Regreession.y_hat(self,X)
y_hat_lb = y_hat - 2*np.sqrt(self.sigma2); y_hat_lb = round(y_hat_lb,3)
y_hat_ub = y_hat + 2*np.sqrt(self.sigma2); y_hat_ub = round(y_hat_ub,3)
print('y的预测值为%.4f'%y_hat)
print(f'y的95%预测区间为 [ {y_hat_lb}, {y_hat_ub} ]')
# 残差散点图——检验异方差
def e_picture(self):
fig, ax = plt.subplots()
ax.spines['right'].set_visible(False) # 去掉顶边框
ax.spines['top'].set_visible(False) # 去掉右边框
markers = ['.','*','v','d','o'] # 注,这里只加了几个而已
for n,i in enumerate(self.X.drop(columns='常数项')):
plt.scatter(self.X[i].values,self.e.values, label=i, marker=markers[n])
plt.xlabel('自变量x',fontsize=13); plt.ylabel(r'残差$e_i$',fontsize=13); # plt.title('残差散点图')
if n != 0:
plt.legend()
plt.axhline(y=0,c="black", lw=0.5)#添加水平直线
plt.tick_params(labelsize=13) #刻度字体大小13
# 等级相关系数法 —— 异方差检验
def rs_test(self):
'''
weighting:是否用加权值进行计算? 是weighting=True
注:等级相关系数法好像只用于一元回归?
一元回归时,这里的自变量默认命名为x
'''
dic_x = self.X.x.sort_values().reset_index().reset_index().set_index('x')['level_0'].to_dict()
level_x = self.X.x.apply(lambda x: dic_x[x]).values # x等级
if type(self.weighting)==float or type(self.weighting)==int : # 如果进行权重估计,用e_iw计算
self.e_iw = Regreession.e_iw_fun(self)
dic_e = abs(self.e_iw.rename('e_iw')).sort_values().reset_index().reset_index().set_index('e_iw')['level_0'].to_dict() # 因为前面一些series没有命名的原因,继承了y的名字
level_e = abs(self.e_iw).apply(lambda x: dic_e[x]).values # e等级, 注意:是e的绝对值!!!abs(e)
a = '加权后的'
else:
dic_e = abs(self.e).sort_values().reset_index().reset_index().set_index('y')['level_0'].to_dict() # 因为前面一些series没有命名的原因,继承了y的名字
level_e = abs(self.e).apply(lambda x: dic_e[x]).values # e等级, 注意:是e的绝对值!!!abs(e)
a = ''
n = len(df49)
di = level_x - level_e
r_s = 1 - (6/(n*(np.square(n)-1)))*np.sum(np.square(di)) # 次幂不能用^,要用np.square()!!!
t = np.sqrt(n-2)*r_s/np.sqrt(1-np.square(r_s)) # t统计量
p = 1 - stats.t.cdf(abs(t),n-2) # p值
print(a,"斯皮尔曼检验:等级相关系数 r_s = %.4f , t = %.4f , p值 = %.4f"%(r_s,t,p))
# 加权残差,返回e_iw
def e_iw_fun(self):
wi = 1/self.X.x**(self.weighting)
e_iw = np.sqrt(wi)*self.e
return e_iw # 返回e_iw
# 残差图——检验自相关
def e_picture_auto(self):
fig = plt.figure(figsize=(8, 8)) #创建画布
ax = axisartist.Subplot(fig, 111) #使用axisartist.Subplot方法创建一个绘图区对象ax
fig.add_axes(ax) #将绘图区对象添加到画布中
ax.axis[:].set_visible(False) #通过set_visible方法设置绘图区所有坐标轴隐藏
ax.axis["x"] = ax.new_floating_axis(0,0) #ax.new_floating_axis代表添加新的坐标轴
ax.axis["x"].set_axisline_style("->", size = 1.0) #给x坐标轴加上箭头
ax.axis["y"] = ax.new_floating_axis(1,0)
ax.axis["y"].set_axisline_style("-|>", size = 1.0) #添加y坐标轴,且加上箭头
ax.axis["x"].set_axis_direction("top") #设置x、y轴上刻度显示方向
ax.axis["y"].set_axis_direction("right")
# 画画
plt.scatter(self.e.values[:-1],self.e.values[1:])
plt.annotate(r"$e_t$", (-0.02,0.2), xycoords='data',xytext=(-0.001,0.8),fontsize=15) # ,arrowprops=dict(arrowstyle='-') # y标签
plt.annotate(r"$e_{t-1}$", (0.2,-0.02), xycoords='data',xytext=(0.8,-0.05),fontsize=15) # x标签
plt.annotate(r'$O$',(0,0),xycoords='data',xytext=(-0.075,-0.075),fontsize=15)
# plt.ylim([-0.34,0.34]); plt.xlim([-0.34,0.34])
plt.xticks([-0.8,-0.4,0,0.4,0.8],['-0.8','-0.4','','0.4','0.8']) # x标签
plt.yticks([-0.8,-0.4,0,0.4,0.8],['-0.8','-0.4','','0.4','0.8']) # y标签
# DW检验自相关
def DW_test(self): # n=20,k=2时的DW临界值
DW = np.sum(np.square(self.e.values[1:]-self.e.values[:-1]))/np.sum(np.square(self.e.values[1:]))
# print('DW值为%.3f'%DW)
return DW
# 迭代法补救自相关性
def iteration(self):
rho = np.sum(self.e.values[1:]*self.e.values[:-1])/np.sum(np.square(self.e.values[1:])) # 误差项一阶自相关系数 \rho
print('rho=',rho)
df = self.df.iloc[:-1,:]*rho - self.df.iloc[1:,:].values
print('一次迭代法:')
rs = Regreession(df)
print('DW值为%.3f'%rs.DW_test())
rs.summary()
return df
# 差分法补救自相关性
def ChaFen(self):
df = self.df.iloc[1:,:].values - self.df.iloc[:-1,:] # 注意这里用DataFrame/Series运算的时候一定要用.values!!!!,不然他会按照索引对齐运算,就全都等于0了。。。
print('一次差分法:\n')
rs = Regreession(df)
rs.DW_test(); print()
rs.summary()
# 总结果
def summary(self):
Regreession.r(self) # 简单相关系数矩阵
print('')
Regreession.R2(self) # 可决系数R^2
Regreession.F_test(self) # F检验
Regreession.t_test(self) # t检验
# Regreession.e_picture(self) # 残差图