简介
使用Numpy、Pandas、自编三种方法计算:平均值、截尾均值、加权平均值、中位数、众数、中列数、极差、四分位数、方差、标准差
实现方法
定义三个类(numpy_funtion、pandas_funtion、my_funtion),类中编写数据基本统计方法。各类中的方法名分别带有 numpy_、pandas_、my_ 前缀(例如 numpy_mean),下表列出去掉前缀后的方法名与功能
名称 | 功能 | 输入 |
mean | 计算均值 | ( 数据列表 ) |
tmean | 计算截尾均值 | ( 数据列表 ) |
weight_mean | 计算加权平均数 | ( 数据列表,权重列表 ) |
median | 计算中位数 | ( 数据列表 ) |
mode | 计算众数 | ( 数据列表 ) |
midrange | 计算中列数 | ( 数据列表 ) |
range | 计算极差 | ( 数据列表 ) |
quantile | 计算四分位数 | ( 数据列表,[分位点列表] ) |
variance | 计算方差 | ( 数据列表 ) |
standard | 计算标准差 | ( 数据列表 ) |
get_timer | 检测方法的效率 | ( 方法,数据列表,[权重列表] ) |
test_all_timer | 测试全部统计方法效率 | ( 数据列表,权重列表,[输出] ) |
测试类-Test | ||
change_parameter | 测试改变传入数据类型 | (数据范围,数据量大小,打印结果) |
change_data_size | 测试数据量不同大小 | (数据量大小,打印结果,绘图) |
统计方法代码
"""
### 中心趋势度量 ###
# mean # 均值
# tmean # 截尾均值
# weight_mean # 加权算数平均数
# median # 中位数
# mode # 众数
# midrange # 中列数
### 数据分散度 ###
# range # 极差
# quantile # 四分位数
# variance # 方差
# standard # 标准差
### 其他 ###
1. __init__(self)
a. 初始化
2. pandas_to_DataFrame(self, data)(仅在pandas类中有)
a. 将数据转换成DataFrame类型
3. get_timer(self, fun, data, weight=[])
a. 参数:传入方法,传输数据,传入权重
b. 作用:用传入的数据测试此方法运行时间
c. 返回值:元组(运行时间,运行结果),如运行失败会返回(False,False)
d. perf_counter_ns返回纳秒时间
4. 方法名:test_all_timer(self, data, weight, out=True)
a. 参数:传入数据,传入权重,是否输出运行结果
b. 作用:测试类中所有统计方法并统计运行时间
c. 返回值:列表;列表每个元素类型为元组(运行时间,运行结果),最后一个元素为(运行总时间(ms),开始时间(ns),结束时间(ns))
d. 返回值2:平均数、截尾平均数、加权平均数、中位数、众数、中列数、极差、四分位数、方差、标准差最后为测试这些方法运行的总时间
"""
class numpy_funtion():
    """Descriptive statistics implemented with NumPy, plus timing helpers.

    Each ``numpy_*`` method takes a sequence of numbers and returns the
    statistic named by the method.  ``get_timer`` times one call and
    ``test_all_timer`` runs and times all ten statistics in a fixed order.
    """

    def __init__(self):
        # The modules are imported here and kept on the instance; every
        # method reaches them through ``self.np`` / ``self.perf_counter_ns``.
        import numpy
        from time import perf_counter_ns
        self.np = numpy
        self.perf_counter_ns = perf_counter_ns

    def numpy_mean(self, data):
        """Return the arithmetic mean of ``data``."""
        return self.np.mean(data)

    def numpy_tmean(self, data):
        """Return the mean of ``data`` without its smallest and largest value."""
        return self.np.mean(self.np.sort(data)[1:len(data) - 1])

    def numpy_weight_mean(self, data, weight):
        """Return the mean of ``data`` weighted element-wise by ``weight``."""
        return self.np.average(data, weights=weight)

    def numpy_median(self, data):
        """Return the median of ``data``."""
        return self.np.median(data)

    def numpy_mode(self, data):
        """Return the most frequent value of ``data``.

        ``bincount`` only accepts non-negative integers, and ``argmax``
        reports a single index, so only one mode is returned when several
        values tie.  When ``bincount`` rejects the input, a message string is
        returned instead of raising.
        """
        try:
            count = self.np.bincount(data)
            return self.np.argmax(count)
        except Exception:
            return 'Failed to find mode, Maybe it contains negative number.'

    def numpy_midrange(self, data):
        """Return the midpoint between the minimum and maximum of ``data``."""
        return (self.np.max(data) + self.np.min(data)) / 2

    def numpy_range(self, data):
        """Return the difference between the maximum and minimum of ``data``."""
        return self.np.max(data) - self.np.min(data)

    def numpy_quantile(self, data, weight=None):
        """Return the percentiles of ``data`` at the levels in ``weight``.

        ``weight`` holds percentages (0-100) and defaults to the quartiles
        ``[25, 50, 75]``.
        """
        if weight is None:
            weight = [25, 50, 75]
        return self.np.percentile(data, weight)

    def numpy_variance(self, data):
        """Return the variance of ``data`` as computed by ``numpy.var``."""
        return self.np.var(data)

    def numpy_standard(self, data):
        """Return the standard deviation of ``data`` as computed by ``numpy.std``."""
        return self.np.std(data)

    def get_timer(self, fun, data, weight=None):
        """Time one call of ``fun`` and return ``(elapsed, result)``.

        ``fun(data)`` is tried first; if that raises ``TypeError`` (for
        example because ``fun`` also needs the weights) the clock is restarted
        and ``fun(data, weight)`` is called instead.  Any other failure, in
        either call, yields ``(False, False)``.
        """
        if weight is None:
            weight = []
        try:
            try:
                time_start = self.perf_counter_ns()
                ans = fun(data)
            except TypeError:
                time_start = self.perf_counter_ns()
                ans = fun(data, weight)
        except Exception:
            # Always a 2-tuple, so callers can index result[i][0] / result[i][1].
            return False, False
        # NOTE(review): nanoseconds // 100000 gives units of 0.1 ms although the
        # report labels the column as ms; kept unchanged so figures stay
        # comparable with the other implementations in this file.
        return (self.perf_counter_ns() - time_start) // 100000, ans

    def test_all_timer(self, data, weight, out=True):
        """Run and time all ten statistics on ``data``.

        Returns a list of eleven tuples: ten ``(elapsed, result)`` pairs in the
        order mean, trimmed mean, weighted mean, median, mode, midrange, range,
        quartiles, variance, standard deviation, followed by
        ``(total_elapsed, start_ns, end_ns)``.  When ``out`` is true the
        results are also printed as a table.
        """
        if out:
            print('--------------- Numpy_Funtion_Start ---------------')
        time_start = self.perf_counter_ns()
        steps = (
            (self.numpy_mean, None),
            (self.numpy_tmean, None),
            (self.numpy_weight_mean, weight),
            (self.numpy_median, None),
            (self.numpy_mode, None),
            (self.numpy_midrange, None),
            (self.numpy_range, None),
            (self.numpy_quantile, None),
            (self.numpy_variance, None),
            (self.numpy_standard, None),
        )
        result = [self.get_timer(fun, data, extra) for fun, extra in steps]
        time_end = self.perf_counter_ns()
        result.append(((time_end - time_start) // 100000, time_start, time_end))
        if out:
            number_row = '\t {:<6} \t\t {:<15} {:<20.4f}'
            # Mode and quartiles are not plain floats, so they are printed as-is.
            plain_row = '\t {:<6} \t\t {:<15} {}'
            rows = (
                ('平均数', number_row),
                ('截尾平均数', number_row),
                ('加权平均数', number_row),
                ('中位数', number_row),
                ('众数', plain_row),
                ('中列数', number_row),
                ('极差', number_row),
                ('四分位数', plain_row),
                ('方差', number_row),
                ('标准差', number_row),
            )
            print('\t {:<6} \t\t {:<15} {:<20}'.format('名称', '耗时(毫秒ms)', '结果'))
            for (label, row), (elapsed, value) in zip(rows, result):
                print(row.format(label, elapsed, value))
            print('\t {:<6} \t\t {:<15}'.format('测试时长', result[10][0]))
            print('--------------- Numpy_Funtion_Finish ---------------')
            print()
        return result
class pandas_funtion():
    """Descriptive statistics implemented with pandas, plus timing helpers.

    The ``pandas_*`` statistics expect a single-column ``DataFrame`` (as
    built by ``pandas_to_DataFrame``) and read the result from column 0.
    ``test_all_timer`` accepts plain sequences and converts them itself.
    """

    def __init__(self):
        # The modules are imported here and kept on the instance; every
        # method reaches them through ``self.pd`` / ``self.perf_counter_ns``.
        import pandas
        from time import perf_counter_ns
        self.pd = pandas
        self.perf_counter_ns = perf_counter_ns

    def pandas_to_DataFrame(self, data):
        """Wrap ``data`` in a ``DataFrame``."""
        return self.pd.DataFrame(data)

    def pandas_mean(self, data):
        """Return the arithmetic mean of the first column."""
        return data.mean().values[0]

    def pandas_tmean(self, data):
        """Return the mean without the smallest and largest row."""
        tmp = data.sort_values(0)
        return tmp.iloc[1:len(data) - 1].mean().values[0]

    def pandas_weight_mean(self, data, weight):
        """Return the mean of ``data`` weighted by the ``weight`` frame."""
        return ((data * weight).sum() / weight.sum()).values[0]

    def pandas_median(self, data):
        """Return the median of the first column."""
        return data.median().values[0]

    def pandas_mode(self, data):
        """Return ``DataFrame.mode()`` converted to nested lists."""
        return data.mode().values.tolist()

    def pandas_midrange(self, data):
        """Return the midpoint between the minimum and maximum."""
        return ((data.max() + data.min()) / 2).values[0]

    def pandas_range(self, data):
        """Return the difference between the maximum and minimum."""
        return (data.max() - data.min()).values[0]

    def pandas_quantile(self, data, weight=None):
        """Return the quantiles at the levels in ``weight`` as nested lists.

        ``weight`` holds fractions in [0, 1] and defaults to the quartiles
        ``[0.25, 0.50, 0.75]``.
        """
        if weight is None:
            weight = [0.25, 0.50, 0.75]
        return data.quantile(weight).values.tolist()

    def pandas_variance(self, data):
        """Return the variance as computed by ``DataFrame.var``."""
        return data.var().values[0]

    def pandas_standard(self, data):
        """Return the standard deviation as computed by ``DataFrame.std``."""
        return data.std().values[0]

    def get_timer(self, fun, data, weight=None):
        """Time one call of ``fun`` and return ``(elapsed, result)``.

        ``fun(data)`` is tried first; if that raises ``TypeError`` (for
        example because ``fun`` also needs the weights) the clock is restarted
        and ``fun(data, weight)`` is called instead.  Any other failure, in
        either call, yields ``(False, False)``.
        """
        if weight is None:
            weight = []
        try:
            try:
                time_start = self.perf_counter_ns()
                ans = fun(data)
            except TypeError:
                time_start = self.perf_counter_ns()
                ans = fun(data, weight)
        except Exception:
            return False, False
        # NOTE(review): nanoseconds // 100000 gives units of 0.1 ms although the
        # report labels the column as ms; kept unchanged so figures stay
        # comparable with the other implementations in this file.
        return (self.perf_counter_ns() - time_start) // 100000, ans

    def test_all_timer(self, data, weight, out=True):
        """Run and time all ten statistics on ``data``.

        ``data`` and ``weight`` are converted to ``DataFrame`` objects first;
        that conversion is included in the total time.  Returns a list of
        eleven tuples: ten ``(elapsed, result)`` pairs in the order mean,
        trimmed mean, weighted mean, median, mode, midrange, range, quartiles,
        variance, standard deviation, followed by
        ``(total_elapsed, start_ns, end_ns)``.  When ``out`` is true the
        results are also printed as a table.
        """
        if out:
            print('--------------- Pandas_Funtion_Start ---------------')
        time_start = self.perf_counter_ns()
        data = self.pandas_to_DataFrame(data)
        weight = self.pandas_to_DataFrame(weight)
        steps = (
            (self.pandas_mean, None),
            (self.pandas_tmean, None),
            (self.pandas_weight_mean, weight),
            (self.pandas_median, None),
            (self.pandas_mode, None),
            (self.pandas_midrange, None),
            (self.pandas_range, None),
            (self.pandas_quantile, None),
            (self.pandas_variance, None),
            (self.pandas_standard, None),
        )
        result = [self.get_timer(fun, data, extra) for fun, extra in steps]
        time_end = self.perf_counter_ns()
        result.append(((time_end - time_start) // 100000, time_start, time_end))
        if out:
            number_row = '\t {:<6} \t\t {:<15} {:<20.4f}'
            # Mode and quartiles are nested lists, so they are printed as-is.
            plain_row = '\t {:<6} \t\t {:<15} {}'
            rows = (
                ('平均数', number_row),
                ('截尾平均数', number_row),
                ('加权平均数', number_row),
                ('中位数', number_row),
                ('众数', plain_row),
                ('中列数', number_row),
                ('极差', number_row),
                ('四分位数', plain_row),
                ('方差', number_row),
                ('标准差', number_row),
            )
            print('\t {:<6} \t\t {:<15} {:<20}'.format('名称', '耗时(毫秒ms)', '结果'))
            for (label, row), (elapsed, value) in zip(rows, result):
                print(row.format(label, elapsed, value))
            print('\t {:<6} \t\t {:<15}'.format('测试时长', result[10][0]))
            print('--------------- Pandas_Funtion_Finish ---------------')
            print()
        return result
class my_funtion():
    """Descriptive statistics written in plain Python, plus timing helpers.

    Each ``my_*`` method takes a sequence of numbers and returns the
    statistic named by the method.  ``get_timer`` times one call and
    ``test_all_timer`` runs and times all ten statistics in a fixed order.
    """

    def __init__(self):
        # Kept on the instance; methods reach them through ``self``.
        from math import sqrt
        from time import perf_counter_ns
        self.sqrt = sqrt
        self.perf_counter_ns = perf_counter_ns

    def my_mean(self, data):
        """Return the arithmetic mean of ``data``."""
        return sum(data) / len(data)

    def my_tmean(self, data):
        """Return the mean of ``data`` without its smallest and largest value."""
        return self.my_mean(sorted(data)[1:len(data) - 1])

    def my_weight_mean(self, data, weight):
        """Return the mean of ``data`` weighted element-wise by ``weight``."""
        weighted_total = sum(d * w for d, w in zip(data, weight))
        return weighted_total / sum(weight)

    def my_median(self, data):
        """Return the median; the mean of the two middle values for even length."""
        ordered = sorted(data)
        middle = len(ordered) // 2
        if len(ordered) & 1:
            return ordered[middle]
        return (ordered[middle - 1] + ordered[middle]) / 2

    def my_mode(self, data):
        """Return a list of every value that occurs most often in ``data``."""
        count = {}
        for value in data:
            count[value] = count.get(value, 0) + 1
        highest = max(count.values())
        return [value for value, seen in count.items() if seen == highest]

    def my_midrange(self, data):
        """Return the midpoint between the minimum and maximum of ``data``."""
        return (max(data) + min(data)) / 2

    def my_range(self, data):
        """Return the difference between the maximum and minimum of ``data``."""
        return max(data) - min(data)

    def my_quantile(self, data, weight=None):
        """Return the quantiles of ``data`` at the levels in ``weight``.

        ``weight`` holds fractions in [0, 1] and defaults to the quartiles
        ``[0.25, 0.50, 0.75]``.  Each quantile sits at the 1-based position
        ``1 + (n - 1) * w`` of the sorted data and is linearly interpolated
        between its two neighbours.

        Raises:
            ValueError: if ``data`` is empty or a level lies outside [0, 1].
        """
        if weight is None:
            weight = [0.25, 0.50, 0.75]
        ordered = sorted(data)
        n = len(ordered)
        if n == 0:
            raise ValueError('my_quantile() needs at least one data point')
        ans = []
        for w in weight:
            if not 0 <= w <= 1:
                raise ValueError('quantile level must be within [0, 1]')
            position = (n - 1) * w          # 0-based, possibly fractional
            lower = int(position)
            upper = min(lower + 1, n - 1)   # clamp so level 1.0 stays in range
            fraction = position - lower
            # Interpolate by the fractional part of the position itself; an
            # exact position (fraction == 0) yields the data value unchanged.
            ans.append(ordered[lower] + (ordered[upper] - ordered[lower]) * fraction)
        return ans

    def my_variance(self, data):
        """Return the variance of ``data`` using the divisor ``len(data)``."""
        avg = sum(data) / len(data)
        squared = ((value - avg) * (value - avg) for value in data)
        return sum(squared) / len(data)

    def my_standard(self, data):
        """Return the square root of ``my_variance(data)``."""
        return self.sqrt(self.my_variance(data))

    def get_timer(self, fun, data, weight=None):
        """Time one call of ``fun`` and return ``(elapsed, result)``.

        ``fun(data)`` is tried first; if that raises ``TypeError`` (for
        example because ``fun`` also needs the weights) the clock is restarted
        and ``fun(data, weight)`` is called instead.  Any other failure, in
        either call, yields ``(False, False)``.
        """
        if weight is None:
            weight = []
        try:
            try:
                time_start = self.perf_counter_ns()
                ans = fun(data)
            except TypeError:
                time_start = self.perf_counter_ns()
                ans = fun(data, weight)
        except Exception:
            return False, False
        # NOTE(review): nanoseconds // 100000 gives units of 0.1 ms although the
        # report labels the column as ms; kept unchanged so figures stay
        # comparable with the other implementations in this file.
        return (self.perf_counter_ns() - time_start) // 100000, ans

    def test_all_timer(self, data, weight, out=True):
        """Run and time all ten statistics on ``data``.

        Returns a list of eleven tuples: ten ``(elapsed, result)`` pairs in the
        order mean, trimmed mean, weighted mean, median, mode, midrange, range,
        quartiles, variance, standard deviation, followed by
        ``(total_elapsed, start_ns, end_ns)``.  When ``out`` is true the
        results are also printed as a table.
        """
        if out:
            print('--------------- My_Funtion_Start --------------------')
        time_start = self.perf_counter_ns()
        steps = (
            (self.my_mean, None),
            (self.my_tmean, None),
            (self.my_weight_mean, weight),
            (self.my_median, None),
            (self.my_mode, None),
            (self.my_midrange, None),
            (self.my_range, None),
            (self.my_quantile, None),
            (self.my_variance, None),
            (self.my_standard, None),
        )
        result = [self.get_timer(fun, data, extra) for fun, extra in steps]
        time_end = self.perf_counter_ns()
        result.append(((time_end - time_start) // 100000, time_start, time_end))
        if out:
            number_row = '\t {:<6} \t\t {:<15} {:<20.4f}'
            # Mode and quartiles are lists, so they are printed as-is.
            plain_row = '\t {:<6} \t\t {:<15} {}'
            rows = (
                ('平均数', number_row),
                ('截尾平均数', number_row),
                ('加权平均数', number_row),
                ('中位数', number_row),
                ('众数', plain_row),
                ('中列数', number_row),
                ('极差', number_row),
                ('四分位数', plain_row),
                ('方差', number_row),
                ('标准差', number_row),
            )
            print('\t {:<6} \t\t {:<15} {:<20}'.format('名称', '耗时(毫秒ms)', '结果'))
            for (label, row), (elapsed, value) in zip(rows, result):
                print(row.format(label, elapsed, value))
            print('\t {:<6} \t\t {:<15}'.format('运行总时长', result[10][0]))
            print('--------------- My_Funtion_Finish ------------------')
            print()
        return result
测试与可视化代码
"""
1. change_parameter:改变数据类型
2. change_data_size:改变数据量大小
3. plot:绘图
"""
class Test():
    """Drive the three statistics classes over varied inputs and plot timings."""

    def __init__(self):
        pass

    def run_test(self, data, weight, out):
        """Run every statistic of all three implementations on one data set."""
        np_f = numpy_funtion()
        pd_f = pandas_funtion()
        my_f = my_funtion()
        for implementation in (np_f, pd_f, my_f):
            implementation.test_all_timer(data, weight, out)

    def change_parameter(self, value_range=30, data_size=30, out=True):
        """Run the statistics on four kinds of input data.

        The kinds are a fixed small sample, random non-negative integers,
        random integers of either sign, and random floats.  ``value_range``
        bounds the random values and ``data_size`` is the number of random
        values per data set.  Weights are always ``1..len(data)``.
        """
        import random
        scenarios = (
            ('############### 简单测试 ###############',
             lambda: [12, 16, 37, 46, 59, 72, 89, 100]),
            ('############### 正整数测试 ###############',
             lambda: [random.randint(0, value_range) for _ in range(data_size)]),
            ('############### 负整数测试 ###############',
             lambda: [random.randint(-value_range, value_range) for _ in range(data_size)]),
            ('############### 浮点数测试 ###############',
             lambda: [random.uniform(0, value_range) for _ in range(data_size)]),
        )
        for banner, make_data in scenarios:
            data = make_data()
            weight = list(range(1, len(data) + 1))
            print(banner)
            print('Data:', data, 'Weight:', weight, sep='\n')
            self.run_test(data, weight, out)

    def change_data_size(self, data_size, out=False, is_plot=True):
        """Time all implementations for each size in ``data_size``.

        ``data_size`` is an iterable of integers.  For each size, that many
        random integers in ``[0, size]`` are generated for both data and
        weights.  When ``is_plot`` is true a bar chart is drawn per size.
        """
        import random
        np_f = numpy_funtion()
        pd_f = pandas_funtion()
        my_f = my_funtion()
        for size in data_size:
            # The comprehension variable must not reuse the name of the size,
            # otherwise each element is bounded by its own index instead.
            data = [random.randint(0, size) for _ in range(size)]
            weight = [random.randint(0, size) for _ in range(size)]
            n_data = np_f.test_all_timer(data, weight, out)
            p_data = pd_f.test_all_timer(data, weight, out)
            m_data = my_f.test_all_timer(data, weight, out)
            if is_plot:
                self.plot(size, n_data, p_data, m_data)

    def plot(self, data_size, np_d, pd_d, my_d):
        """Draw a grouped bar chart of the per-statistic timings.

        ``np_d``, ``pd_d`` and ``my_d`` are the lists returned by the three
        ``test_all_timer`` methods; their last entry (the total) is ignored.
        """
        import numpy as np
        import matplotlib.pyplot as plt
        plt.rcParams['font.sans-serif'] = ['SimHei']   # font able to render the Chinese labels
        plt.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly
        title_label = '数据量为 {} 的运行时间'.format(data_size)
        xticks_ch = ['均值', '截尾均值', '加权均值', '中位数', '众数', '中列数', '极差',
                     '四分位数', '方差', '标准差']
        positions = np.arange(10)
        # Keep the elapsed time of each statistic and drop the trailing total.
        ny = [entry[0] for entry in np_d][:-1]
        py = [entry[0] for entry in pd_d][:-1]
        my = [entry[0] for entry in my_d][:-1]
        width, distance = 0.25, 0.25   # bar width and offset between the three bars
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.bar(positions - distance, ny, width, facecolor='#9999ff', label='Numpy')
        ax.bar(positions, py, width, facecolor='lightgreen', label='Pandas')
        ax.bar(positions + distance, my, width, facecolor='#ff9999', label='My')
        ax.set(xticks=positions, title=title_label, xlabel='方法/函数', ylabel='运行时间(ms)')
        ax.set_xticklabels(xticks_ch, rotation=30)
        ax.legend()
        # Write each bar's value just above it.
        for pos, y1, y2, y3 in zip(positions, ny, py, my):
            ax.text(pos - distance, y1, y1, ha='center', va='bottom')
            ax.text(pos, y2, y2, ha='center', va='bottom')
            ax.text(pos + distance, y3, y3, ha='center', va='bottom')
        plt.show()
误差分析
- 众数(mode):Numpy类中的众数方法(mode)无法很好地处理负数和小数,多个值出现次数一样时只能返回一个结果。
原因:numpy中没有直接查找众数的函数,这里用的是bincount()函数。它返回的也是一个数组,这个数组比较特殊:下标 i 处的值表示整数 i 在原数组中出现的次数,因此只适用于非负整数;再用argmax()取出现次数最多的下标时,只会返回第一个(最小的)众数。
- 方差(variance):Pandas 计算方差的方法返回结果略有不同
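原因:Pandas 中计算方差为 无偏样本方差(除以 n-1),公式为:
$$s^2 = \frac{1}{n-1}\sum_{i=1}^{n}(x_i-\bar{x})^2$$
Numpy 和自定义方法计算的是总体方差(除以 n),公式为:
$$\sigma^2 = \frac{1}{n}\sum_{i=1}^{n}(x_i-\bar{x})^2$$
自编方法使用的是和Numpy相同的公式。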
运行截图
正整数测试
100000000个元素