文章目录

  • 踩坑:
  • python data_stru
  • list
  • tuple
  • dict
  • set
  • 特殊语句
  • try.except for
  • with:
  • practical manipulation
  • string's operations
  • latex output
  • conventional operations
  • file_operations
  • matplotlib
  • data_clear
  • matplotlib
  • data_clear


学过一遍的吧.csv读成.xlsx,就花了1h,纯纯的笑话捏

踩坑:

  1. matplotlib与latex联用,需要download MiKTeX+ghostscript
  2. Python中单引号和双引号的区别 - 知乎 (zhihu.com)
  3. reshape(-1,1): -1 lets NumPy automatically calculate the number of rows
  4. combination——hstack([h,w]); vstack([h,w])
  5. read_excel(‘xxx.xlsx’,header=None)

python data_stru

range(1,10,2)#不含末尾
import numpy as np
np.linspace(1,10,9)

list

print(::step)	##从首元素起以step为步长
a=[1,2,3]
#increase list elements
a.append(4)
a.extend([3,2,1])
a.insert(0,-1)
#delete list elements
a.pop()		#delete the last element
a.pop(2)  	#delete third element
a.remove(3) #delete element=3,if 3 doesn't exist,report error
a.clear()	#clear list
#external operations
a.index(e)	#look up the index of e
a.count(e)	#count how many times e appears in the list
a+b			#connect a and b,不同类型的列表拼接
b*n			#repeat b for n times
b.sort()	#sort list in ascending order
a=sorted(b);	a=sorted(b,reverse=True)
a.reverse()	#reverse order

tuple

a=(1,2,3)
b=3,
#sole-two operations
a.index(3)
b.count(1)

dict

  • 1) every key is unique; 2) a={a:1,b:2}; a[0:3]

set

  • every element is unique
#create set
a=set('abcde')	#a=set()
a={1,2,3,4}
a.add(5)
a.remove(4)		#if 4 doesn't exist, an error (KeyError) is raised
a.discard(4)		#like remove, but no error if 4 doesn't exist
a.clear()
b=a.copy()		#copy rather than direct b to a's content
b=a.pop()		#remove and return an arbitrary element
a.update(t)		#a=a|t (in place)
a.difference(t)	#a-t
a.intersection(t)	#a&t
a.symmetric_difference(t)	#a^t
c=a.union(t)	#c=a|t
  • 这些容器可以按索引或标签取元素+赋值修改(apart from tuple)

特殊语句

try.except for

  • 不因为可能出现的错误而中止程序

datahub订阅后python消费 data python_显示中文

with:

  • 这样就可以连续展示多张图片啦

datahub订阅后python消费 data python_数据分析_02

practical manipulation

string’s operations

  • can be combined with list/tuple/dict operations
a=' obligation	'
len('happy')
a.count('o')
eval('21+32')
ind=a.find('ob')
list2=a.split(i)
#strip 剥去:the mover stripped the house of furniture
a.strip('i')	#remove leading and trailing 'i' characters
a.strip()	#remove leading and trailing whitespace
sec=['a','sdf','asdfd']
print('hdx'.join(sec))	#the opposite of split

latex output

from IPython.display import Latex
Latex(r'\alpha={}'.format(123))
Latex(r'\alpha={0}'.format(123))
Latex(r'\alpha={name}'.format(name=123))
import math 
import latexify 
# Recursive sequence: f_1(1) = 1, f_1(2) = 2, and f_1(x) = f_1(x-1) + f_1(x-2) otherwise.
# There is no base case for x < 1, so such inputs never reach a terminating branch.
# NOTE(review): the decorator presumably makes latexify render this definition as LaTeX
# when the object is printed/displayed — confirm; newer latexify releases may expose
# this decorator under a different name (e.g. `latexify.function`).
@latexify.with_latex
def f_1(x):
    if x==1:
        return 1
    elif x==2:
        return 2
    else:
        return f_1(x-1) + f_1(x-2)
print(f_1)
f_1

conventional operations

#map: apply a function to every element of the given iterable(s)
a=map(pow,range(6),[2 for b in range(6)])
list(a)	#[0,1,4,9,16,25]
#reduce: fold an iterable with a two-parameter function (Python 3: from functools import reduce)
print(reduce(lambda x,y:x+y,range(1,6)))
#filter: returns an iterator over the elements for which the function is true —— a quick way to pick out specific elements
print(list(filter(lambda n:n%3==0,range(1,21))))#ps: an iterator can't be printed directly
#zip: combine lists element-wise into groups (tuples)
a=range(1,5)
b=range(5,9)
list(zip(*zip(a,b)))	#unzip: split a list of tuples back into separate tuples
#enumerate枚举可迭代对象:list/tuple/...
print([value[0]for (int,value)in enumerate(a)])
  • list(...) converts the content/iterator in the parentheses into a list
  • a tuple can be built much like a list comprehension: tuple(x for x in ...)

file_operations

  • 从文件(txt,xls,xlsx)IO
#txt,不可包含其他符号
import pandas as pd
a=pd.read_csv("pd1.txt",parse_dates={'birthday':[0,1,2]})
b=a.values
c=a.line_name
f=open("pd11","w")
f.write(b)	#or: f.writelines(str1)
f.close()
#xlsx,
a=pd.read_excel("pd2.xlsx",usecols=range(1,4))
b=a.values
#acquire data subset,iloc按索引,loc还可按tag
b1=a.iloc[np.arange(6),[0,1]]	#[行标签,列标签]
b2=a.loc[np.arange(7),["player2","player1"]]
c=a.describe()	#表格描述性统计
d=pd.DataFrame(b,index=np.arange(1,11),columns=["user1","user2"])	#build DataFrame
#write c to excel(f)
f=pd.ExcelWriter('Pd22.xlsx')
c.to_excel(f,"sheet1")
c.to_excel(f,"sheet2")
f.save()
  • normal r/w

datahub订阅后python消费 data python_python_03

datahub订阅后python消费 data python_datahub订阅后python消费_04

  • file_management
import os
a1=os.listdir("c:\\");print(a1)
os.rename("t1.txt","t2.txt")
os.mkdir("bull")
os.chdir("d:\\target_category")
os.getcwd()	#get current directory
os.rmdir("for_deleting_category_name")

matplotlib

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lFaZe1mA-1678630994144)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20230116121513686.png)]

#a和plt都是图,plt可以操控子图
import matplotlib.pyplot as plt
s=plt.figure()
a=s.add_subplot(2,5,6)
#designate axes and line
a.set_xticks(list,label list,rotation=20,fontsize='small')
a.set_xtitle(label list,rotation=20,fontsize='small')
a.set_xticklabels()
a.set_title()
plt.plot(np.random.randn(50),bins=20,color='k',alpha=0.3,label='republican')#plot(x,y,attributes)	x can be neglected
#another way to design axs/line
plt.rc('lines', linewidth = 4);	plt.rc('axes', prop_cycle = sample_cycler);	plt.rc('font', weight ='bold')
plt.rc('xtick.major', size = 5, pad = 7);	plt.rc('xtick', labelsize = 15)
#piece out figure into arrays
fig,axes=plt.subplots(2,3)
axes[i,j].hist(np.random)	#hist=直方图,scatter()=散点图,plot=折线图
plt.subplots_adjust(wspace=0,hspace=0);	
#display legend
plt.legend(labels=['tag1','tag2','tag3'],loc='best')
#正常显示中文
plt.rc('font' , family='SimHei' ) #用来正常显示中文标签
#substitution
plt.rcParams['font.sans-serif']=['SimHei']
#用来正常显示负号
plt.rc('axes',unicode_minus=False)
#display latex
plt.rc('text',usetex=True)
#fill_between import numpy as np
x2=linspace(za[-1],4,100);	y2=norm.pdf(x2);	y1=[0]*len(x2)
fill_between(x2,y1,y2,color='g')
# conventional static graph
plt.hist(h,interval_number)
plt.boxplot(hw,label=['height','weight'])
#经验分布函数
plt.hist(w,20,density=True,histtype='step',cumulative=True)	#density=True:面积的积分为1; histtype:直方图形状,default='bar'
#Q-Q:经验分布和理论分布的分位数点作为一对array draw at the 直角coordinate system(两者近似相等的依据?拟合统计量小,求导,分位点近似相等)
plt.plot(yi,sh,'o',label='QQ图')
#或调用library function
from scipy.stats import probplot         
res=probplot(h,plot=plt);	plt.show()

data_clear

import pandas as pd
#1. 重复处理
print(any(a.duplicated()));	a.drop_duplicates(inplace=True)
#2. 缺失
print(any(a.isnull()));	
#delete 缺失值
excel.dropna()	#equivalent to excel.dropna(axis=0,how='any',thresh=None)
excel.dropna(axis=0/1,how='any/all',thresh=3);	#how='any'/'all':有一个/全部NONE就删除该axis
excel.drop('user B',axis=1)
#3. data_fill
b1=a.fillna(0);	b2=a.fillna(method='ffill');	b3=a.fillna(method='bfill');	#fill with the previous/next row's value
b4=a.fillna(value={'gender':a.gender.mode()[0],'age':a.age.mean(),'income':a.income.median()})
#众数,均值,中位数
#4. 插值法a.attribute.interpolate()
b5=a.fillna(value={'age':a.age.interpolate(method='polynomial',order=2),'income':a.income.interpolate()})
#二次插值,一次
#5. abnormal value process
a.counts.mean();	a.counts.std()
Q1=a.counts.quantile(0.25)
IQR=a.counts.quantile(0.75)-Q1
any(a.counts>mu+2*s);	any(a.counts>Q3+1.5*IQR)	#Q3=a.counts.quantile(0.75)
st=a.counts[a.counts<UB].max()	#index with a boolean mask,找出低于判别上界的最大值
a.loc[a.counts>UB,'counts']=st	#高于判别上界的赋为st,a.counts>UB,'counts'分别为选中的行,列

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VDd2PzcQ-1678630994380)(null)]686.png]

txt]()

  • file_management
import os
a1=os.listdir("c:\\");print(a1)
os.rename("t1.txt","t2.txt")
os.mkdir("bull")
os.chdir("d:\\target_category")
os.getcwd()	#get current directory
os.rmdir("for_deleting_category_name")

matplotlib

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存失败,源站可能有防盗链机制,建议将图片保存下来直接上传下上传(iIMfHYB8FPnt-1628991101)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20278630974349)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20230116121513686.png)]

#a和plt都是图,plt可以操控子图
import matplotlib.pyplot as plt
s=plt.figure()
a=s.add_subplot(2,5,6)
#designate axes and line
a.set_xticks(list,label list,rotation=20,fontsize='small')
a.set_xtitle(label list,rotation=20,fontsize='small')
a.set_xticklabels()
a.set_title()
plt.plot(np.random.randn(50),bins=20,color='k',alpha=0.3,label='republican')#plot(x,y,attributes)	x can be neglected
#another way to design axs/line
plt.rc('lines', linewidth = 4);	plt.rc('axes', prop_cycle = sample_cycler);	plt.rc('font', weight ='bold')
plt.rc('xtick.major', size = 5, pad = 7);	plt.rc('xtick', labelsize = 15)
#piece out figure into arrays
fig,axes=plt.subplots(2,3)
axes[i,j].hist(np.random)	#hist=直方图,scatter()=散点图,plot=折线图
plt.subplots_adjust(wspace=0,hspace=0);	
#display legend
plt.legend(labels=['tag1','tag2','tag3'],loc='best')
#正常显示中文
plt.rc('font' , family='SimHei' ) #用来正常显示中文标签
#substitution
plt.rcParams['font.sans-serif']=['SimHei']
#用来正常显示负号
plt.rc('axes',unicode_minus=False)
#display latex
plt.rc('text',usetex=True)
#fill_between import numpy as np
x2=linspace(za[-1],4,100);	y2=norm.pdf(x2);	y1=[0]*len(x2)
fill_between(x2,y1,y2,color='g')
# conventional static graph
plt.hist(h,interval_number)
plt.boxplot(hw,label=['height','weight'])
#经验分布函数
plt.hist(w,20,density=True,histtype='step',cumulative=True)	#density=True:面积的积分为1; histtype:直方图形状,default='bar'
#Q-Q:经验分布和理论分布的分位数点作为一对array draw at the 直角coordinate system(两者近似相等的依据?拟合统计量小,求导,分位点近似相等)
plt.plot(yi,sh,'o',label='QQ图')
#或调用library function
from scipy.stats import probplot         
res=probplot(h,plot=plt);	plt.show()

data_clear

import pandas as pd
#1. 重复处理
print(any(a.duplicated()));	a.drop_duplicates(inplace=True)
#2. 缺失
print(any(a.isnull()));	dropna()	#delete 缺失值; equivalent to dropna(axis=0,how='any',thresh=None)
dropna(axis=0/1,how='any/all',thresh=3);	drop('user B',axis=1)	#how='any'/'all':有一个/全部NONE就删除该axis
#3. data_fill
b1=a.fillna(0);	b2=a.fillna(method='ffill');	b3=a.fillna(method='bfill');	b4=a.fillna(value={'gender':a.gender.mode()[0],'age':a.age.mean(),'income':a.income.median()})
#4. 插值法a.attribute.interpolate()
b5=a.fillna(value={'age':a.age.interpolate(method='polynomial',order=2),'income':a.income.interpolate()})
#5. abnormal value process
a.counts.mean();	a.counts.std()
Q1=a.counts.quantile(0.25)
IQR=a.counts.quantile(0.75)-Q1
any(a.counts>mu+2*s);	any(a.counts>Q3+1.5*IQR)	#Q3=a.counts.quantile(0.75)
st=a.counts[a.counts<UB].max()	#index with a boolean mask
a.loc[a.counts>UB,'counts']=st	#a.counts>UB,'counts'分别为选中的行,列