文章目录
- 踩坑:
- python data_stru
- list
- tuple
- dict
- set
- 特殊语句
- try.except for
- with:
- practical manipulation
- string's operations
- latex output
- conventional operations
- file_operations
- matplotlib
- data_clear
- matplotlib
- data_clear
学过一遍的吧.csv读成.xlsx,就花了1h,纯纯的笑话捏
踩坑:
- matplotlib与latex联用,需要下载并安装 MiKTeX + Ghostscript
- Python中单引号和双引号的区别 - 知乎 (zhihu.com)
- reshape(-1,1): -1 lets NumPy calculate the number of rows automatically
- combination——hstack([h,w]); vstack([h,w])
- read_excel('xxx.xlsx',header=None)
python data_stru
range(1,10,2)#不含末尾
import numpy as np
np.linspace(1,10,9)
list
print(a[::step]) ##从首元素起以step为步长
a=[1,2,3]
#increase list elements
a.append(4)
a.extend([3,2,1])
a.insert(0,-1)
#delete list elements
a.pop() #delete ultimate element
a.pop(2) #delete third element
a.remove(3) #delete element=3,if 3 doesn't exist,report error
a.clear() #clear list
#external operations
a.index(e) #look for e's subscript
a.count(e) #count how many times e appears in the list
a+b #connect a and b,不同类型的列表拼接
b*n #repeat b for n times
b.sort() #sort list in ascending order
a=sorted(b); a=sorted(b,reverse=True)
a.reverse() #reverse order
tuple
a=(1,2,3)
b=3,
#sole-two operations
a.index(3)
b.count(1)
dict
- 1) keys are unique; 2) a={'a':1,'b':2};
a['a'] #access by key; a dict cannot be sliced like a[0:3]
set
- every element is sole
#create set
a=set('abcde') #a=set()
a={1,2,3,4}
a.add(5)
a.remove(4) #if 4 doesn't exist,report error (KeyError)
a.discard(4) #like remove,but no error if 4 doesn't exist
a.clear()
b=a.copy() #copy rather than direct b to a's content
b=a.pop() #remove and return an arbitrary element (sets are unordered)
a.update(t) #a=a|t (in place)
a.difference(t) #a-t
a.intersection(t) #a&t
a.symmetric_difference(t) #a^t
c=a.union(t) #c=a|t
- 这些容器可以按索引或标签取元素+赋值修改(apart from tuple)
特殊语句
try.except for
- 不因为可能出现的错误而中止程序
with:
- 这样就可以连续展示多张图片啦
practical manipulation
string’s operations
- can be associated with list/tuple/dict operations
a=' obligation '
len('happy')
a.count('o')
eval('21+32')
ind=a.find('ob')
list2=a.split(i)
#strip 剥去:the mover stripped the house of furniture
a.strip('i') #remove leading/trailing 'i' characters
a.strip() #remove front or back spaces
sec=['a','sdf','asdfd']
print('hdx'.join(sec)) #opposite of split
- r:去转义,b:转为bytes对象(网络编程),u:中文字符前避免存储格式问题
- print(f"{name} done in {time.time() - t0:.2f} s.")
- '{:.2f}'.format(x)
- https://blog.51cto.com/u_15127632/2739868
latex output
from IPython.display import Latex
Latex(r'$\alpha={}$'.format(123))
Latex(r'$\alpha={0}$'.format(123))
Latex(r'$\alpha={name}$'.format(name=123))
import math
import latexify
# Recursive sequence used as the latexify demo: f_1(1)=1, f_1(2)=2,
# and f_1(x)=f_1(x-1)+f_1(x-2) for every other x.
# Only x==1 and x==2 are base cases, so any x that is not a positive
# integer never reaches one and the recursion does not terminate normally.
# NOTE(review): newer latexify releases appear to expose this decorator as
# latexify.function - confirm against the installed version.
@latexify.with_latex
def f_1(x):
    if x==1:
        return 1
    elif x==2:
        return 2
    else:
        return f_1(x-1) + f_1(x-2)
print(f_1)
f_1
conventional operations
#map:function effect at list
a=map(pow,range(6),[2 for b in range(6)])
list(a) #[0,1,4,9,16,25]
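#reduce:function with 2 parameters,applied cumulatively to the sequence
from functools import reduce #Python 3: reduce lives in functools
print(reduce(lambda x,y:x+y,range(1,6)))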
#filter:return an iterator over the elements whose result is True——quickly choose special elements
print(list(filter(lambda n:n%3==0,range(1,21))))#ps:an iterator can't be printed directly
#zip combine lists into element's group
a=range(1,5)
b=range(5,9)
c=list(zip(a,b)); list(zip(*c)) #zip(a,b) pairs the elements; zip(*c) disassembles the tuple list back into tuples
#enumerate枚举可迭代对象:list/tuple/...
print([value for (idx,value) in enumerate(a)]) #enumerate yields (index,value) pairs; avoid naming the index int (shadows the builtin)
- list([]) convert content/iterators in brackets into list
- tuple(x for x in ...) with a generator expression works similarly to a list comprehension
file_operations
- 从文件(txt,xls/xlsx)IO
#txt,不可包含其他符号
import pandas as pd
a=pd.read_csv("pd1.txt",parse_dates={'birthday':[0,1,2]})
b=a.values
c=a.line_name
f=open("pd11","w")
f.write(str(b)) #or f.writelines(str1); write needs a str, and Python comments use # rather than //
f.close()
#xlsx,
a=pd.read_excel("pd2.xlsx",usecols=range(1,4))
b=a.values
#acquire data subset,iloc按索引,loc还可按tag
b1=a.iloc[np.arange(6),[0,1]] #[行标签,列标签]
b2=a.loc[np.arange(7),["player2","player1"]]
c=a.describe() #表格描述性统计
d=pd.DataFrame(b,index=np.arange(1,11),columns=["user1","user2"]) #build DataFrame
#write c to excel(f)
with pd.ExcelWriter('Pd22.xlsx') as f: #the with-block saves on exit; f.save() was removed in pandas 2.0
    c.to_excel(f,sheet_name="sheet1")
    c.to_excel(f,sheet_name="sheet2")
- normal r/w
- file_management
import os
a1=os.listdir("c:\\");print(a1)
os.rename("t1.txt","t2.txt")
os.mkdir("bull")
os.chdir("d:\\target_category")
os.getcwd() #get current working directory
os.rmdir("for_deleting_category_name")
matplotlib
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lFaZe1mA-1678630994144)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20230116121513686.png)]
#a和plt都是图,plt可以操控子图
import matplotlib.pyplot as plt
s=plt.figure()
a=s.add_subplot(2,5,6)
#designate axes and line
a.set_xticks(tick_list) #tick positions
a.set_xticklabels(label_list,rotation=20,fontsize='small') #tick labels; there is no set_xtitle,use set_xlabel for the axis label
a.set_xticklabels()
a.set_title()
plt.hist(np.random.randn(50),bins=20,color='k',alpha=0.3,label='republican')#bins belongs to hist; for a line use plot(x,y,attributes), x can be omitted
#another way to design axs/line
plt.rc('lines', linewidth = 4); plt.rc('axes', prop_cycle = sample_cycler); plt.rc('font', weight ='bold')
plt.rc('xtick.major', size = 5, pad = 7); plt.rc('xtick', labelsize = 15)
#piece out figure into arrays
fig,axes=plt.subplots(2,3)
axes[i,j].hist(np.random.randn(100)) #hist=直方图,scatter()=散点图,plot=折线图
plt.subplots_adjust(wspace=0,hspace=0);
#display legend
plt.legend(labels=['tag1','tag2','tag3'],loc='best')
#正常显示中文
plt.rc('font',family='SimHei') #用来正常显示中文标签
#substitution
plt.rcParams['font.sans-serif']=['SimHei']
#用来正常显示负号
plt.rc('axes',unicode_minus=False)
#display latex
plt.rc('text',usetex=True)
#fill_between
import numpy as np; from scipy.stats import norm
x2=np.linspace(za[-1],4,100); y2=norm.pdf(x2); y1=[0]*len(x2)
plt.fill_between(x2,y1,y2,color='g')
# conventional static graph
plt.hist(h,interval_number)
plt.boxplot(hw,labels=['height','weight']) #matplotlib>=3.9 renames this argument to tick_labels
#经验分布函数
plt.hist(w,20,density=True,histtype='step',cumulative=True) #density=True:面积的积分为1; histtype:直方图形状,default='bar'
#Q-Q:经验分布和理论分布的分位数点作为一对array draw at the 直角coordinate system(两者近似相等的依据?拟合统计量小,求导,分位点近似相等)
plt.plot(yi,sh,'o',label='QQ图')
#或调用library function
from scipy.stats import probplot
res=probplot(h,plot=plt); plt.show()
data_clear
import pandas as pd
#1. 重复处理
print(any(a.duplicated())); a.drop_duplicates(inplace=True)
#2. 缺失
print(a.isnull().values.any()); #any(a.isnull()) would iterate over the column names,not the values
#delete 缺失值
excel.dropna() #等价于 excel.dropna(axis=0,how='any')
excel.dropna(axis=0,how='any') #axis=0/1:行/列; how='any'/'all':有一个/全部为NaN就删除; thresh=3:只保留至少有3个非缺失值的行(thresh与how不能同时指定)
excel.drop('user B',axis=1)
#3. data_fill
b1=a.fillna(0); b2=a.ffill(); b3=a.bfill(); #fill with the previous/next row (older pandas: a.fillna(method='ffill'/'bfill'))
b4=a.fillna(value={'gender':a.gender.mode()[0],'age':a.age.mean(),'income':a.income.median()})
#众数,均值,中位数
#4. 插值法a.attribute.interpolate()
b5=a.fillna(value={'age':a.age.interpolate(method='polynomial',order=2),'income':a.income.interpolate()})
#二次插值,一次
#5. abnormal value process
mu=a.counts.mean(); s=a.counts.std()
Q1=a.counts.quantile(0.25); Q3=a.counts.quantile(0.75)
IQR=Q3-Q1
UB=Q3+1.5*IQR #判别上界
any(a.counts>mu+2*s); any(a.counts>UB)
st=a.counts[a.counts<UB].max() #convey a bool array,找出低于判别上界的最大值
a.loc[a.counts>UB,'counts']=st #高于判别上界的赋为st,a.counts>UB,'counts'分别为选中的行,列
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VDd2PzcQ-1678630994380)(null)]686.png]
txt]()
- file_management
import os
a1=os.listdir("c:\\");print(a1)
os.rename("t1.txt","t2.txt")
os.mkdir("bull")
os.chdir("d:\\target_category")
os.getcwd() #get current working directory
os.rmdir("for_deleting_category_name")
matplotlib
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存失败,源站可能有防盗链机制,建议将图片保存下来直接上传下上传(iIMfHYB8FPnt-1628991101)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20278630974349)(C:/Users/c1826/AppData/Roaming/Typora/typora-user-images/image-20230116121513686.png)]
#a和plt都是图,plt可以操控子图
import matplotlib.pyplot as plt
s=plt.figure()
a=s.add_subplot(2,5,6)
#designate axes and line
a.set_xticks(tick_list) #tick positions
a.set_xticklabels(label_list,rotation=20,fontsize='small') #tick labels; there is no set_xtitle,use set_xlabel for the axis label
a.set_xticklabels()
a.set_title()
plt.hist(np.random.randn(50),bins=20,color='k',alpha=0.3,label='republican')#bins belongs to hist; for a line use plot(x,y,attributes), x can be omitted
#another way to design axs/line
plt.rc('lines', linewidth = 4); plt.rc('axes', prop_cycle = sample_cycler); plt.rc('font', weight ='bold')
plt.rc('xtick.major', size = 5, pad = 7); plt.rc('xtick', labelsize = 15)
#piece out figure into arrays
fig,axes=plt.subplots(2,3)
axes[i,j].hist(np.random.randn(100)) #hist=直方图,scatter()=散点图,plot=折线图
plt.subplots_adjust(wspace=0,hspace=0);
#display legend
plt.legend(labels=['tag1','tag2','tag3'],loc='best')
#正常显示中文
plt.rc('font',family='SimHei') #用来正常显示中文标签
#substitution
plt.rcParams['font.sans-serif']=['SimHei']
#用来正常显示负号
plt.rc('axes',unicode_minus=False)
#display latex
plt.rc('text',usetex=True)
#fill_between
import numpy as np; from scipy.stats import norm
x2=np.linspace(za[-1],4,100); y2=norm.pdf(x2); y1=[0]*len(x2)
plt.fill_between(x2,y1,y2,color='g')
# conventional static graph
plt.hist(h,interval_number)
plt.boxplot(hw,labels=['height','weight']) #matplotlib>=3.9 renames this argument to tick_labels
#经验分布函数
plt.hist(w,20,density=True,histtype='step',cumulative=True) #density=True:面积的积分为1; histtype:直方图形状,default='bar'
#Q-Q:经验分布和理论分布的分位数点作为一对array draw at the 直角coordinate system(两者近似相等的依据?拟合统计量小,求导,分位点近似相等)
plt.plot(yi,sh,'o',label='QQ图')
#或调用library function
from scipy.stats import probplot
res=probplot(h,plot=plt); plt.show()
data_clear
import pandas as pd
#1. 重复处理
print(any(a.duplicated())); a.drop_duplicates(inplace=True)
#2. 缺失
print(a.isnull().values.any()); a.dropna() #delete 缺失值; 等价于 a.dropna(axis=0,how='any')
a.dropna(axis=0,how='any'); a.drop('user B',axis=1) #axis=0/1:行/列; how='any'/'all':有一个/全部为NaN就删除; thresh=3:只保留至少有3个非缺失值的行(thresh与how不能同时指定)
#3. data_fill
b1=a.fillna(0); b2=a.ffill(); b3=a.bfill(); b4=a.fillna(value={'gender':a.gender.mode()[0],'age':a.age.mean(),'income':a.income.median()})
#4. 插值法a.attribute.interpolate()
b5=a.fillna(value={'age':a.age.interpolate(method='polynomial',order=2),'income':a.income.interpolate()})
#5. abnormal value process
mu=a.counts.mean(); s=a.counts.std()
Q1=a.counts.quantile(0.25); Q3=a.counts.quantile(0.75)
IQR=Q3-Q1
UB=Q3+1.5*IQR #判别上界
any(a.counts>mu+2*s); any(a.counts>UB)
st=a.counts[a.counts<UB].max() #convey a bool array
a.loc[a.counts>UB,'counts']=st #a.counts>UB,'counts'分别为选中的行,列