correlative coefficient 相关系数 covariation 协方差
krutosis 峰度 skruness 偏度
虽然我现在未必记得这些函数的用法,但是我曾经知道过,这就够了~~
目录
- 1. calculus by sympy+scipy
- 符号表达式→anonymous函数
- sympy求解方程
- sympy解差分方程
- scipy与数值解
- 2. numpy.linalg
- 最小二乘法拟合
- 3. probability
- 4. premium test
- 5. variance analysis
- 6. Univariate linear regression model
1. calculus by sympy+scipy
from sympy import *
#expressive confer value
expression.subs({x:0,y:1}) #tuple/dict
#limit
limit(sin(x),x,0); limit(pow(1+1/x),x,oo)
#partial differential(偏微分)
diff(sin(x)+x**2+exp(y),x,2)
#sum series
factor(summation(k**2,(k,1,n)))
#taylor unfold
series(sin(x),x,0,3)
#integrition
integrate(sin(x+y),(x,0,pi),(y,0,pi))#x,y是符号
#algebra equation(group)
solve([x**2+y**2-1,x-y],[x,y])
#partial differential quation numerical result
solve(diff(x**3-1),x)
#partial differential quation symbolic result
x=symbols('x'); y=symbols('y',cls=Function)
y=dsolve(diff(exp(2*x)+y(x)**2),y(x),ics={y(0):1}) #Eq(y(x),y_expr)
符号表达式→anonymous函数
s=sp.lambdify(x,y.args[1],'numpy') #做科学计算调用的库
s(pi/2)
sympy求解方程
- sympy只能解一阶常微分方程和高阶常系数偏微分方程
- 可解代数方程、常微分方程
sympy解差分方程
from sympy import Function,rsolve
from sympy.abc import n
- 齐次线性
- 非齐次线性
LesLie模型(多变量差分)
scipy与数值解
#scipy.integrate
from scipy.integrate import quad;
quad(lambda x:sin(x),0,1)
dblquad(f,0,1,g(x),h(x))##f的最后一个参数为最外层,对应dblquad的第一对取值
- 非线性方程数值解可用二分/Newton迭代
#scipy.optimize
from scipy.optimize import fsolve,fminbound,fmin,minimize
#非线性方程数值解
fsolve(f,[100,1]) #x[0],x[1]初值为100,1
#一元函数极值
fminbound(f,a,b); #在[a,b]上的最小值
fmin(f,x0) #
#多元函数极值
x0=minimize(f,[2.0,2.0]); #2.0,2.0为初值
x0.x; x0.fun #极小处x和极小值
2. numpy.linalg
- 轴=数组的层级,第0层=数组最外层"[]",
若函数中axis=i, 则沿着第i个下标变化、其余下标都不变的方向进行操作 - np.tile(A,(3,1)) #贴瓷砖 tile the tile
- 矩阵:@(对np.array)矩阵相乘=np.matmul,*对np.mat=np.matmul,(对np.array)逐个元素相乘=np.multiply,
- np.dot(a, b, out=None):对于二维数组,它相当于矩阵的乘法;对于一维数组,则是向量的内积;而对于n维,它是a的最后一个轴向和b的倒数第二个轴向的乘积和。
- np.outer(1-d vec,1-d vec),产生两一维向量的外积
- C=np.inner(A,B); Cij=A[i]*B[j]
#numpy
import numpy as np
np.eye(4); np.ones(4); np.zeros(4); np.ones_like(A); np.vander([1,2,3,4]); #vandermo
np.diag(4)#diag make 1-d to 对角阵,2-d取对角元
np.dot(); np.inner(); np.outer(); np.trace(); np.transpose() #trace为主对角线元素的和
x=np.arange(9).reshape(3,3)
print(np.diag(x)) #[0,4,8]
print(np.diag(x,k=1)) #[1,5]
- Watch Out! a=np.arange(8).reshape(2,4); a[0]是第0行而非第0列
#numpy.linalg
import numpy.linalg as LA
LA.det(); LA.eig(); LA.eigvals(); LA.inv(); LA.pinv();
- [val,vec]=LA.eig®; vec[:,0]\vec[:,1]\…vec[:,k-1]为val[0],…,val[k-1]的特征向量
LA.solve(A,b); #只可求唯一解
LA.pinv(A).dot(b); #可求最小范数解
- svd
m=np.arange(1,7).reshape(2,3)
U,a1,V=np.linalg.svd(m)
A1=np.zeros((len(U),len(V)))
for i,j in enumerate(a1):
A1[i][i]=j
print(U.dot(A1.dot(V)))
- hstack(),vstack(),c_(),r_()
- print([x,x])会炸
#np.lstsq(超定线性方程组的最小二乘法解)
LA.lstsq()[0]; LA.qr(); LA.svd(); LA.norm(); LA.matrix_rank()
回归系数,残差平方和,自变量X的秩,X的奇异值=LA.lstsq() #X·ans=Y
a,b=LA.lstsq()[0]
- LA.norm(x,ord=np.inf,axis=1/0/None,keepdims=False)
(|x1|ord+…+|xn|ord)^(1/ord)
最小二乘法拟合
- x=LA.pinv(b).dot©
- x=np.polyfit(x,y)
- md=statemodels.formula.ols(formula,data).fit() #data用字典 md.summary(); md.predict({‘x1’:a,‘x2’:})
或者md=statemodels.api.OLS(y,X).fit(); md.params(); md.predict(X) - sklearn.linear_model.LinearRegression().fit(x,y)
3. probability
adopt scipy.stats
from scipy.stats import *
#distribution catogery
uniform.pdf(x,a,b) #uniform distribution
expon.pdf(x,theta) #exponential distribution
norm.pdf(x,mu,sigma) #normal distribution
chi2.pdf(x,n) #Chi-square dis
t.pdf(x,n) #
f.pdf(x,m,n)
gamma.pdf(x,A,B)
@ binom.pmf(x,n,p); geom.pmf(x,p); poisson(x,lambda) #首次成功概率
#distribution properties
norm.pdf(x,mu,sigma) #probability density function
norm.cdf(6,mean,variance) #cumulative density function
norm.ppf(0.95,mean,variance); #probability percentage function
norm.rvs(mu,sigma,size=N) #produce N random number in normal distribution
norm.fit(a) #return maximum likelihood estimation of a's mu&sigma
bi_mean,bi_variance=binorm.stats(n,p)
bi_mean,bi_variance,bi_skewness,bi_kurtosis=binorm.stats(n,p,moments='mvsk')
import numpy as np
h=np.arange(10).reshape(2,5).flatten()
w=np.arange(10)*2.reshape(2,5).flatten()
hw=hstack([h,w]); hw1=vstack([h,w])
np.mean(h); np.median(h); np.ptp(h); np.var(h); np.std(h)
cov(hw.T); corrcoef(hw.T) #covariance; correlative coefficient
4. premium test
import scipy.stats as ss
#区间估计of t dis
ss.t.interval(alpha,df=len(a)-1,loc=a.mean(),scale=ss.sem(a))#a=np.array(range(10))
单个总体均值的假设检验
#acquire t
tstat1,pvalue=ztest(a,value=0.5)
#query z,s=a.std(ddof)
tstat2=tstat1*a.std(ddof=1)/sigma
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zUZjFZb5-1678628197195)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20230118143715639.png)]
#两个总体均值的假设检验
tstat,pvalue,df=ttest_ind(a,b,value=0) #value=\mu_a-\mu_b
- 拟合优度is appropriate() ,not optimal
#无限值且分布含未知参数的处理
import scipy.stats as ss
statVal,pVal=ss.kstest(w,'norm',(mu,s)) #样本均值、方差作为\mu,\sigma的估计,
#1. 只含有限值且完全已知
import numpy as np
import scipy.stats as ss
bins=np.arange(1,8)
mi=np.array([36,23,29,31,34,60,25])
n=mi.sum(); p=np.ones(7)/7
cha=(mi-n*p)**2/(n*p); st=cha.sum()
bd=ss.chi2.ppf(0.95,len(bins)-1)
print("statistics={},border value={}".format(st,bd))
#2. 只含有限值且分布含未知参数_用极大似然估计
#np.diff(a,n=1,axis=0)
#3. 含无限值且分布含未知参数
#通过划分区间把无限值转为有限值,再用极大似然估计;一般用均值、样本方差替代
#卡方表那个是P(Z>=自由度为n的卡方分布的取值b)=a时的b,P=该分布下出现b的概率
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
a=np.loadtxt("./data/04/Pdata4_6_2.txt")
w=a[:,1::2]; w=w.flatten()
mu=w.mean(); s=w.std(ddof=1)
print('mu,s=',mu,s)
statVal,pVal=ss.kstest(w,'norm',(mu,s)) #样本均值、方差作为\mu,\sigma的估计,
print('统计量和P值分别为',[statVal,pVal]) #统计量就是书上的Z,p是p(Z>=z)
5. variance analysis
from statsmodels.api as sm
model=sm.formula.ols("y~C(x1)+C(x2)+C(x1):C(x2)",d).fit() #构建模型
anovat=sm.stats.anova_lm(model) #进行单因素方差分析
#single variable
import numpy as np
import pandas as pd
import statsmodels.api as sm
df=pd.read_excel('data/04/Pdata4_23.xlsx',header=None) #header=None
a=df.values.T.flatten()
b=np.arange(1,6)
x=np.tile(b,(4,1)).T.flatten()
d={'x':x,'y':a}
model=sm.formula.ols("y~(x)",d).fit()
anovat=sm.stats.anova_lm(model)
print(anovat)
#multiple variable
import numpy as np
import statsmodels.api as sm
y=np.array([[11,11,13,10],[10,11,9,12],[9,10,7,6],[7,8,11,10],[5,13,12,14],[11,14,13,10]]).flatten()
A=np.tile(np.arange(1,5),(6,1)).flatten()
B=np.tile(np.arange(1,4),(8,1)).T.flatten()
d={'x1':A,'x2':B,'y':y}
model=sm.formula.ols("y~C(x1)+C(x2)+C(x1):C(x2)",d).fit()
anovat=sm.stats.anova_lm(model)
print(anovat)
6. Univariate linear regression model
import numpy as np
w,b=np.polyfit(x,y,deg=1)
#basd on formula
import statsmodels.api as sm
res=sm.formula.ols('y~x',df).fit()
print(res.summary(),'\n')
ypred=res.predict(dict(x=8))
print('预测值',list(ypred))
#based on array
X=sm.add_constant(x)
md=sm.OLS(y,X).fit() #build and fit model
print(md.param,'\n----\n') #extract regression coefficent
print(md.summary2())
ypred=md.predict([1,8])
print("预测值=",ypred)
- 差别只在得到res/md的方式
- 残差的自由度是n-2?