描述性统计分析:
import numpy as np
import pandas as pd
inputfile=('data.csv')
data = pd.read_csv(inputfile)
description=[data.min(),data.max(),data.mean(),data.std()]
description=pd.DataFrame(description,index=['Min','Max','Mean','STD']).T
print('描述性统计结果:\n',np.round(description,2))
相关性分析:
corr=data.corr(method='pearson')
print('相关系数矩阵为:\n',np.round(corr,2))
绘制相关性热力图:
import matplotlib.pyplot as plt
import seaborn as sns
plt.subplots(figsize=(10,10))
sns.heatmap(corr,annot=True,vmax=1,square=True,cmap="Blues")
plt.title('相关性热力图3116')
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
plt.show()
plt.close
数据预处理:Lasso回归:
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
inputfile=('data.csv')
data = pd.read_csv(inputfile)
lasso=Lasso(1000)
lasso.fit(data.iloc[:,0:13],data['y'])
print('相关系数为:',np.round(lasso.coef_,5))
print('相关系数非零个数为:',np.sum(lasso.coef_!=0))
mask=lasso.coef_!=0
print('相关系数是否为零:',mask)
outputfile='new_reg_data.csv'
mask=np.append(mask,True)
new_reg_data=data.iloc[:,mask]
new_reg_data.to_csv(outputfile)
print('输出数据的维度为:',new_reg_data.shape)
构建财政收入的预测模型:灰色预测:
import sys
sys.path.append('D:/数据挖掘/chapter')
import numpy as np
import pandas as pd
from GM11 import GM11
inputfile1='new_reg_data.csv'
inputfile2='data.csv'
new_reg_data=pd.read_csv(inputfile1)
data=pd.read_csv(inputfile2)
new_reg_data.index=range(1994,2014)
new_reg_data.loc[2014]=None
new_reg_data.loc[2015]=None
cols=['x1','x3','x4','x5','x6','x7','x8','x13']
for i in cols:
f=GM11(new_reg_data.loc[range(1994,2014),i].values)[0]
new_reg_data.loc[2014,i]=f(len(new_reg_data)-1)
new_reg_data.loc[2015,i]=f(len(new_reg_data))
new_reg_data[i]=new_reg_data[i].round(2)
outputfile='new_reg_data_GM11.xls'
y=list(data['y'].values)
y.extend([np.nan,np.nan])
new_reg_data['y']=y
new_reg_data.to_excel(outputfile)
print('预测结果为:\n',new_reg_data.loc[2014:2015,:])
2016年的预测:
import sys
sys.path.append('D:/数据挖掘/chapter')
import numpy as np
import pandas as pd
from GM11 import GM11
inputfile1='new_reg_data_GM11.xls'
inputfile2='data.csv'
new_reg_data=pd.read_excel(inputfile1)
data=pd.read_csv(inputfile2)
new_reg_data.index=range(1994,2016)
#new_reg_data.loc[2014]=None
new_reg_data.loc[2016]=None
cols=['x1','x3','x4','x5','x6','x7','x8','x13']
for i in cols:
f=GM11(new_reg_data.loc[range(1994,2015),i].values)[0]
new_reg_data.loc[2016,i]=f(len(new_reg_data)-1)
# new_reg_data.loc[2015,i]=f(len(new_reg_data))
new_reg_data[i]=new_reg_data[i].round(2)
outputfile='new_reg_data_GM11_2.xls'
y=list(data['y'].values)
y.extend([np.nan,np.nan])
#new_reg_data['y']=y
new_reg_data.to_excel(outputfile)
print('预测结果为:\n',new_reg_data.loc[2016,:])
构建支持向量回归预测模型:
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVR
inputfile = 'new_reg_data_GM11.xls' # 灰色预测后保存的路径
data = pd.read_excel(inputfile) # 读取数据
feature = ['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x13'] # 属性所在列
data.index = range(1994,2016)
data_train = data.loc[range(1994,2014)].copy() # 取2014年前的数据建模
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean)/data_std # 数据标准化
x_train = data_train[feature].to_numpy() # 属性数据
y_train = data_train['y'].to_numpy() # 标签数据
linearsvr = LinearSVR() # 调用LinearSVR()函数
linearsvr.fit(x_train,y_train)
x = ((data[feature] - data_mean[feature])/data_std[feature]).to_numpy() # 预测,并还原结果。
data['y_pred'] = linearsvr.predict(x) * data_std['y'] + data_mean['y']
outputfile = 'new_reg_data_GM11_revenue.xls' # SVR预测后保存的结果
data.to_excel(outputfile)
print('真实值与预测值分别为:\n',data[['y','y_pred']])
fig = data[['y','y_pred']].plot(subplots = True, style=['b-o','r-*'],title="地方财政收入真实值与预测值对比图3116") # 画出预测结果图
plt.show()
GM21:
import numpy as np
from sympy import *
from sympy.abc import x, y
import re
import math
init_printing()
# 定义符号常量x 与 f(x) g(x)。这里的f g还可以用其他字母替换,用于表示函数
f, g = symbols('f g', cls=Function)
def solving_equation(x1, equation_parameter):
# 二阶齐次微分方程的根分为两个不同的实根、两个相同的实根以及两个虚根
# 不同情况方程的形式不同,根据predict函数中求得的带参数的方程来决定使用的策略
# 下面以两个不同的实根为例
parameter = solve(
[x * math.exp(equation_parameter[0] * 0) + y * math.exp(equation_parameter[1] * 0) + equation_parameter[2] - x1[0],
x * math.exp(equation_parameter[0] * (len(x1)-1)) + y * math.exp(equation_parameter[1] * (len(x1)-1)) +
equation_parameter[2] - x1[len(x1) - 1]])
# 返回X1的预测值
return [parameter[x] * math.exp(equation_parameter[0] * i) + parameter[y] * math.exp(equation_parameter[1] * i) +
equation_parameter[2] for i in range(len(x1))]
def predict(data):
x1 = data.cumsum()
a_x0 = np.ediff1d(data).T
z = (x1[:len(x1) - 1] + x1[1:]) / 2.0
B = np.array([-data[1:], -z, np.ones([len(data) - 1])]).T
Y = a_x0
u = np.dot(np.dot(np.linalg.inv(np.dot(B.T, B)), B.T), Y)
a1, a2, b = u[0], u[1], u[2]
# 用diffeq代表微分方程
diffeq = Eq(f(x).diff(x, x) + a1 * f(x).diff(x) + a2 * f(x), b)
# 调用dsolve函数,返回一个Eq对象,并提取带参数方程
differential_equation = str(dsolve(diffeq, f(x)).args[1])
print(differential_equation)
# 使用正则表达式提取齐次微分方程的根与非齐次微分方程的特解
equation_parameter = re.findall("-?\d+.?\d+", differential_equation.replace(' ', ''))
print(equation_parameter)
# str转为float
for i in range(len(equation_parameter)):
equation_parameter[i] = float(equation_parameter[i])
# 利用边界条件,取X1中第一个数和最后一个数,构造方程组,求参数C1和C2,并返回预测值
return solving_equation(x1, equation_parameter)
if __name__ == '__main__':
inputfile1='new_reg_data.csv'
inputfile2='data.csv'
new_reg_data=pd.read_csv(inputfile1)
data=pd.read_csv(inputfile2)
new_reg_data.index=range(1994,2014)
cols=['x1','x3','x4','x5','x6','x7','x8','x13']
for i in cols:
f=predict(new_reg_data.loc[range(1994,2014),i].values)[0]
new_reg_data.loc[2014,i]=f(len(new_reg_data)-1)
new_reg_data.loc[2015,i]=f(len(new_reg_data))
new_reg_data[i]=new_reg_data[i].round(2)
outputfile='new_reg_data_GM21.xls'
y=list(data['y'].values)
y.extend([np.nan,np.nan])
new_reg_data['y']=y
new_reg_data.to_excel(outputfile)
print('预测结果为:\n',new_reg_data.loc[2014:2015,:])
报错了……
TypeError: 'Float' object is not callable
时间序列:
K-means聚类算法:
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# 计算欧拉距离
def calcDis(dataSet, centroids, k):
clalist=[]
for data in dataSet:
diff = np.tile(data, (k, 1)) - centroids #相减 (np.tile(a,(2,1))就是把a先沿x轴复制1倍,即没有复制,仍然是 [0,1,2]。 再把结果沿y方向复制2倍得到array([[0,1,2],[0,1,2]]))
squaredDiff = diff ** 2 #平方
squaredDist = np.sum(squaredDiff, axis=1) #和 (axis=1表示行)
distance = squaredDist ** 0.5 #开根号
clalist.append(distance)
clalist = np.array(clalist) #返回一个每个点到质点的距离len(dateSet)*k的数组
return clalist
# 计算质心
def classify(dataSet, centroids, k):
# 计算样本到质心的距离
clalist = calcDis(dataSet, centroids, k)
# 分组并计算新的质心
minDistIndices = np.argmin(clalist, axis=1) #axis=1 表示求出每行的最小值的下标
newCentroids = pd.DataFrame(dataSet).groupby(minDistIndices).mean() #DataFramte(dataSet)对DataSet分组,groupby(min)按照min进行统计分类,mean()对分类结果求均值
newCentroids = newCentroids.values
# 计算变化量
changed = newCentroids - centroids
return changed, newCentroids
# 使用k-means分类
def kmeans(dataSet, k):
# 随机取质心
centroids = random.sample(dataSet, k)
# 更新质心 直到变化量全为0
changed, newCentroids = classify(dataSet, centroids, k)
while np.any(changed != 0):
changed, newCentroids = classify(dataSet, newCentroids, k)
centroids = sorted(newCentroids.tolist()) #tolist()将矩阵转换成列表 sorted()排序
# 根据质心计算每个集群
cluster = []
clalist = calcDis(dataSet, centroids, k) #调用欧拉距离
minDistIndices = np.argmin(clalist, axis=1)
for i in range(k):
cluster.append([])
for i, j in enumerate(minDistIndices): #enymerate()可同时遍历索引和遍历元素
cluster[j].append(dataSet[i])
return centroids, cluster
# 创建数据集
def createDataSet():
return [[1, 1], [1, 2], [2, 1], [6, 4], [6, 3], [5, 4]]
if __name__=='__main__':
dataset = createDataSet()
centroids, cluster = kmeans(dataset, 2)
print('质心为:%s' % centroids)
print('集群为:%s' % cluster)
for i in range(len(dataset)):
plt.scatter(dataset[i][0],dataset[i][1], marker = 'o',color = 'green', s = 40 ,label = '原始点')
# 记号形状 颜色 点的大小 设置标签
for j in range(len(centroids)):
plt.scatter(centroids[j][0],centroids[j][1],marker='x',color='red',s=50,label='质心')
plt.show()
import numpy as np
import random
import matplotlib.pyplot as plt
def distance(point1, point2): # 计算距离(欧几里得距离)
return np.sqrt(np.sum((point1 - point2) ** 2))
def k_means(data, k, max_iter=10000):
centers = {} # 初始聚类中心
# 初始化,随机选k个样本作为初始聚类中心。 random.sample(): 随机不重复抽取k个值
n_data = data.shape[0] # 样本个数
for idx, i in enumerate(random.sample(range(n_data), k)):
# idx取值范围[0, k-1],代表第几个聚类中心; data[i]为随机选取的样本作为聚类中心
centers[idx] = data[i]
# 开始迭代
for i in range(max_iter): # 迭代次数
print("开始第{}次迭代".format(i+1))
clusters = {} # 聚类结果,聚类中心的索引idx -> [样本集合]
for j in range(k): # 初始化为空列表
clusters[j] = []
for sample in data: # 遍历每个样本
distances = [] # 计算该样本到每个聚类中心的距离 (只会有k个元素)
for c in centers: # 遍历每个聚类中心
# 添加该样本点到聚类中心的距离
distances.append(distance(sample, centers[c]))
idx = np.argmin(distances) # 最小距离的索引
clusters[idx].append(sample) # 将该样本添加到第idx个聚类中心
pre_centers = centers.copy() # 记录之前的聚类中心点
for c in clusters.keys():
# 重新计算中心点(计算该聚类中心的所有样本的均值)
centers[c] = np.mean(clusters[c], axis=0)
is_convergent = True
for c in centers:
if distance(pre_centers[c], centers[c]) > 1e-8: # 中心点是否变化
is_convergent = False
break
if is_convergent == True:
# 如果新旧聚类中心不变,则迭代停止
break
return centers, clusters
def predict(p_data, centers): # 预测新样本点所在的类
# 计算p_data 到每个聚类中心的距离,然后返回距离最小所在的聚类。
distances = [distance(p_data, centers[c]) for c in centers]
return np.argmin(distances)
x=np.random.randint(0,high=10,size=(200,2))
centers,clusters=k_means(x,3)
print(centers)
clusters
for center in centers:
plt.scatter(centers[center][0],centers[center][1],marker='*',s=150)
colors=['r','b','y','m','c','g']
for c in clusters:
for point in clusters[c]:
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
plt.title("可视化聚类图3116")
plt.scatter(point[0],point[1],c=colors[c])
import numpy as np
from sklearn.datasets import make_blobs
n_samples = 1500
random_state = 170
transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
X_aniso = np.dot(X, transformation) # Anisotropic blobs
X_varied, y_varied = make_blobs(
n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
) # Unequal variance
X_filtered = np.vstack(
(X[y == 0][:500], X[y == 1][:100], X[y == 2][:10])
) # Unevenly sized blobs
y_filtered = [0] * 500 + [1] * 100 + [2] * 10
import matplotlib.pyplot as plt
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))
axs[0, 0].scatter(X[:, 0], X[:, 1], c=y)
axs[0, 0].set_title("Mixture of Gaussian Blobs")
axs[0, 1].scatter(X_aniso[:, 0], X_aniso[:, 1], c=y)
axs[0, 1].set_title("Anisotropically Distributed Blobs")
axs[1, 0].scatter(X_varied[:, 0], X_varied[:, 1], c=y_varied)
axs[1, 0].set_title("Unequal Variance")
axs[1, 1].scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_filtered)
axs[1, 1].set_title("Unevenly Sized Blobs")
plt.suptitle("Ground truth clusters3116").set_y(0.95)
plt.show()