从py文件导入模块,保证目录正确(spyder set console working directory)

from mdlp import MDLP
var=MDLP()
dataframe 基本
sai
# Variable types: inspecting and converting column dtypes
train.info()
data.dtypes
# NOTE(review): np.int was deprecated (NumPy 1.20) and removed in 1.24 --
# use int or np.int64 on modern NumPy.
data['date']=data['date'].astype(np.int)
# Strip a trailing '%' from string data before casting to float.
data['date'] = data['date'].str.replace('%','').astype(np.float64)
#特定类型统一操作
def ob_float(data):
    """Convert percent-formatted object columns of a DataFrame to float64.

    Every column whose dtype is ``object`` -- except the first such column,
    which is deliberately skipped (presumably an identifier column like
    'num'; TODO confirm against callers) -- has its '%' signs stripped and
    is cast to ``np.float64``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose object columns contain strings such as ``'12.3%'``.

    Returns
    -------
    pandas.DataFrame
        The same frame, mutated in place (returned for convenience).
    """
    data_dtype = data.dtypes
    # Index of all object-dtyped column names.
    data_object = data_dtype[data_dtype == 'object'].index
    # [1:] intentionally leaves the first object column untouched.
    for col in data_object[1:]:
        data[col] = data[col].str.replace('%', '').astype(np.float64)
    return data
#查看唯一值
np.unique(data_new.num)
#百分比
data20wsum.describe(percentiles=[.10, .20, .30,0.40])
data_last32.mean()
#众数
data_last32.mode().iloc[0]
import pandas as pd
import numpy as np
df.head()
data_last3_mindata.shape[1]
data_last3_mindata.shape[0]
#pyecharts作图
pddt.values
pddt.index
#注意数据格式
datatest是2*5的数据框
v3=datatest.values[1] #float 长度5
v2 = [[0.94, 0.79, 0.92, 0.93, 0.69]] #list长度1
#一个list 一个float 但是长度一致 √
v4=[5, 20, 36, 10, 75, 90]
v5=pddt.values[1]
#列:
df['A']
df.loc[:,['A','B']]
data.loc[(data["Gender"]=="Female") & (data["Education"]=="Not Graduate") & (data["Loan_Status"]=="Y"),
["Gender","Education","Loan_Status"]]
data['date']=data['num'].apply(lambda x: x[3:6])

#单列用作index

ValueError: Buffer has wrong number of dimensions (expected 1, got 2)

错误 train=data.iloc[train_index,:] #即使只有一列,也不能直接把df做为索引

正确 train=data.iloc[train_index.Resample1,:]

#行:

df[0:3]

df['20130102':'20130104']

df.iloc[3]

#一列

data_last['C7'].iloc[1]

#####iloc 与loc

#选取特定index

data['num'].loc[select_index]

data522a523.loc[3192576430792]

#注意,iloc是将序列当作数组来访问,下标又会从0开始:

data['num'].iloc[[2,3,4]]

删除pandas DataFrame的某一/几列:

方法一:直接del DF['column-name']

方法二:采用drop方法,有下面三种等价的表达式:

1. DF= DF.drop('column_name', 1);

2. DF.drop('column_name',axis=1, inplace=True)

3. DF.drop(DF.columns[0:2], axis=1, inplace=True) # Note: zero indexed,按列位置删除

注意:凡是会对原数组作出修改的操作,往往都有一个 inplace 可选参数(默认为 False)。inplace=True 时直接在原对象上修改、不返回新对象(如 2 和 3 的写法所示);inplace=False 时原对象不变,操作返回一个新对象,需要将结果赋给一个新变量或覆盖原变量(如 1 的写法所示)。

df.iloc[3:5,0:2]

#满足条件:

df[df.A>0]
df[df>0]
#针对series
data_model523=data_copy['sum']
data_copy_unsatisf_mode523=data_model523[data_model523<65 ]
data_last3_min21['U26'][(data_last3_min21['U26']>test[4]) & (data_last3_min21['U26']<=test[5])]=-6
[pandas多条件行数据筛选]()
print df[(df['PCTL']<0.95) & (df['PCTL']>0.05)]

如果是要对 element-wise 作判断,考虑用 Series.map / df.applymap

如果是行列运算考虑用df.apply + lambda

data20wsum=data20wsum.sort_values()

obj4 = obj3.reset_index(drop=True)

#错误方式

df_train=df_train.reindex()
columns = data_last_res.columns.values.tolist()
columns.index('C7')
dataframe常见操作
###len(x)
data.groupby('date').count()['num']
data.groupby('date').count()['num'].plot(kind='bar')
data_20W_maxdata=data_20W_max.groupby('num').min(skipna=True)
data_last=data_new.groupby('num').mean() #mean不要有(skipna=True)
##group min不足 一列,多列排序后保留唯一值
data = data_k.sort_values(['num', 'sum'], ascending=[0, 0])
data_3= data.drop_duplicates(['num'])
#分组条件统计 可以先条件后统计 特别注意cci_all.drop(cci_all[cci_all['sum']<30].index,0)的index都是唯一值(合并dataframe组成的要重置)
cci_all_date=cci_all.groupby('date').count()['num']
cci_all_s= cci_all.drop(cci_all[cci_all['sum']<30].index,0)
cci_all_s_num=cci_all_s.groupby('date').count()['num']
cci_all_res = pd.DataFrame({'all':cci_all_date,'comp':cci_all_s_num})
cci_all_res['satis']=cci_all_res['all'] - cci_all_res['comp']
cci_all_res['accuracy']=cci_all_res['satis']/cci_all_res['all']
data= data.drop(data[data['date']<500].index,0)
data=data.reset_index(drop=True)

注意:

train2=df2.drop(['Pclass','Sex','Cabin','Embarked','Name','Ticket','PassengerId'], axis=1, inplace=True)

删除列前面不可以写=

df2.drop(['Pclass','Sex','Cabin','Embarked','Name','Ticket','PassengerId'], axis=1, inplace=True)

但是filter可以

train_df = df.filter(regex='Survived|Age_.*|SibSp|Parch|Fare_.*|Cabin_.*|Embarked_.*|Sex_.*|Pclass_.*')

比drop的好处就是 df可以复用

data_complain=df_train.merge(data, on='num',how='left')

#series合并为df

c = pd.DataFrame([a,b])

c = pd.DataFrame({'a':a,'b':b})

#以某一列为索引

data_20W=data_20W.set_index('num')

#获取索引 list方便显示

testindex=list(data_last_res.index)

testindex=data_last_res.index #set 方便交集等运算

#抽样

data_last_res=data_last_res.sample(frac=1,random_state=123).reset_index(drop=True)

import copy

origin = [1, 2, [3, 4]]

#origin 里边有三个元素:1, 2,[3, 4]

cop1 = copy.copy(origin)

cop2 = copy.deepcopy(origin)

#透视表

#Determine pivot table
impute_grps = data.pivot_table(values=["LoanAmount"], index=["Gender","Married","Self_Employed"], aggfunc=np.mean)
print impute_grps
pd.crosstab(data["Credit_History"],data["Loan_Status"],margins=True)
def percConvert(ser):
    """Divide a Series by its last element.

    Intended for a crosstab row with ``margins=True``, where the last entry
    is the 'All' total, turning counts into row proportions.

    Uses positional access via ``.iloc[-1]``: the old ``ser[-1]`` integer
    fallback on a label-indexed Series was deprecated and removed in
    pandas 2.x.
    """
    return ser / float(ser.iloc[-1])
pd.crosstab(data["Credit_History"],data["Loan_Status"],margins=True).apply(percConvert, axis=1)
不断添加dataframe
data_k=pd.DataFrame()
data_k= pd.concat([data_k,data_complain_531],ignore_index=True)
data_k= pd.concat([data_k,data_complain_531],axis=1) #横着连
[【pandas】[3] DataFrame 数据合并,连接(merge,join,concat)]()
DataFrame.mean(axis=0,skipna=True)
data_num1['C12'].min(skipna=True)
data_score['sum']=data_score0.sum(1,skipna=True)

读存数据

长数字作为字符读入

data0=pd.read_csv('%s\%s'%(root,fn),usecols=[0,27],dtype={'num':str})

encoding='gbk'

read_csv只是把数据读作df,和读入文件格式无关,

target_url = ("https://archive.ics.uci.edu/ml/machine-learning-"

"databases/undocumented/connectionist-bench/sonar/sonar.all-data")

rocksVMines = pd.read_csv(target_url,header=None, prefix="V")

wine = pd.read_csv(target_url,header=0, sep=";")

data = urllib2.urlopen(target_url)

abalone = pd.read_csv(target_url,header=None, prefix="V")

列名

abalone.columns = ['Sex', 'Length', 'Diameter', 'Height', 'Whole weight',
'Shucked weight', 'Viscera weight', 'Shell weight',
'Rings']
#读取指定列,使用这个参数可以加快加载速度并降低内存消耗。
data = pd.read_csv('data.csv',usecols=[0,1,2,3])
data=pd.read_csv('C:/Users/maojh/Documents/jm/data_score_rate_yidong2.csv',usecols=range(6))
#index=False 首列1234
data_last.to_csv('C:/Users/maojh/Documents/jm/model_data/data522_09042.csv',index=False)
data_last.to_csv('C:/Users/maojh/Documents/jm/pd.csv')
# python xxx.py inputfilename.csv
input_path = sys.argv[1]
output_path = input_path + 'result.csv'
data0=pd.read_csv(input_path)
data['sum'].to_csv(output_path, index=False)
## Repeating / replicating elements
# R equivalent:
rep(c(1,3),each=4)
# Python: repeat each element of `lis` by the matching count in `times`.
lis = ["A", "B"]
times = (2, 3)
# sum() with [] start concatenates the per-element lists (quadratic, fine for notes).
sum(([x]*y for x,y in zip(lis, times)),[])
['A', 'A', 'B', 'B', 'B']
# Fixed repetition count (3) per element:
list(chain.from_iterable([[num]*3 for num in data]))
# NOTE(review): izip is Python 2 only -- on Python 3 use the builtin zip.
from itertools import chain, izip, starmap
from operator import mul
list(chain.from_iterable(starmap(mul, izip(lis, times))))
####每列唯一值及其个数
#批量
import os
# Batch-load every CSV under `source` into a list of DataFrames, tagging
# each with a date string taken from its filename.
# NOTE(review): pasted without indentation -- loop bodies must be re-indented
# before this runs; depends on `name` (column names) and ob_float defined
# elsewhere in the file.
source = "C:\\Users\\maojh\\Documents\\jm\\csv"
lst0=[]
per_test0=[]
for root, dirs, files in os.walk(source):
for fn in files :
# Drop the file's own header row and apply the shared column names.
data0=pd.read_csv('%s\\%s'%(root,fn),header=None,skiprows=1,names=name)
# First three filename characters encode the date tag.
data0['date_string']=fn[0:3]
data0=ob_float(data0)
lst0.append(data0)
data0_1=data0.iloc[:,1:]
data0_percentest=data0_1.max() # column-wise max: check percents vs. decimals
per_test0.append(data0_percentest)
np.save("C:\\Users\\maojh\\Documents\\jm\\jm725_5_data_new.npy",data_new)
numpy.load("filename")
读取json文件
f=open('C:/Users/xx/Documents/heat.json','r')
histdataM1=json.loads(f.read())
f.close()
heat=histdataM1['data']
import pandas
#list转dataframe
test2=pandas.DataFrame(heat, columns=['lon', 'lat', 'number'])
test2['number'].quantile(0.2)
排序
df.sort_values(['high', '日序'], ascending=[True, False])
百分数等数字object转换
data_complain_514['C5'] = data_complain_514['C5'].str.replace('%','').astype(np.float64)
spyder绘图单独显示
tools-preference-ipython/python console-graphics-backend-automatic
editor background
tools-preference-syntax
df.groupby('key1').mean()
def min_max(data):
    """Min-max scale a Series to [0, 1] in place, leaving NaNs untouched.

    If the column is constant (max == min) every non-null value is set to 1,
    which avoids a divide-by-zero.

    Parameters
    ----------
    data : pandas.Series
        Numeric column; missing values are preserved as-is.

    Returns
    -------
    pandas.Series
        The same Series, mutated in place (returned so it works with
        ``DataFrame.apply``).
    """
    max_data = data.max(skipna=True)
    min_data = data.min(skipna=True)
    if max_data == min_data:
        # Constant column: map every non-null entry to 1.
        data[data.notnull()] = 1
    else:
        data[data.notnull()] = (data - min_data) / (max_data - min_data)
    return data
#test=min_max(data_num1['C2'])
data_num1_minmax=data_num1.iloc[:,2:60].apply(min_max)
# Duplicates: np.unique(return_index=True) gives the position of the FIRST
# occurrence of each value, so any row position absent from `idx` holds a
# repeated value.
x, idx = np.unique(data.num, return_index=True)
# NOTE(review): 7137 is hard-coded to this dataset's row count -- should be
# len(data); and `i not in idx` is an O(n) array scan per test, a set(idx)
# would be much faster.
idx_data=range(7137)
get_double=[]
for i in idx_data:
if i not in idx:
get_double.append(i)
#缺失值统计
#Create a new function:
def num_missing(x):
    """Return the number of missing (NaN/None) values in a Series.

    Uses the vectorized ``isnull().sum()`` rather than the Python-level
    ``sum(x.isnull())`` -- same result, one C-speed pass.
    """
    return x.isnull().sum()
#Applying per column:
# NOTE(review): Python-2 print statements -- wrap in print() on Python 3.
print "Missing values per column:"
print data.apply(num_missing, axis=0) #axis=0 defines that function is to be applied on each column
#Applying per row:
print "\nMissing values per row:"
print data.apply(num_missing, axis=1).head() #axis=1 defines that function is to be applied on each row
#缺失统计
def sum_na(data):
    """Count missing values in a Series.

    Applied with ``axis=1`` it counts NaNs per row; with ``axis=0``
    per column.
    """
    return data.isnull().sum()
data_new['column_na']= data_new.apply(sum_na,axis=1)
data_new['count_na']=1
miss_data_row=pd.pivot_table(data_new,index=["column_na"],values=["count_na"],aggfunc=np.sum)
miss_data_row.sort_index(ascending=False)
miss_data_row.sort_index(by='count_na',ascending=False).head(20)
total= df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total','Percent'])
missing_data.head(20)
df_train= df_train.drop((missing_data[missing_data['Total'] > 50000]).index,1)
sum_na_col=data_last.apply(lambda x: sum(x.isnull()),axis=0)
per_na_col=sum_na_col/data_last.shape[0]
sum_na_row=data_last.apply(lambda x: sum(x.isnull()),axis=1)
#每列唯一值
data_k.apply(lambda x: len(x.unique()),axis=0)
plt.hist(sum_na_row)
sum_na_row=sum_na_row.tolist()
count_na_row = {k:sum_na_row.count(k) for k in set(sum_na_row)}
#缺失值众数填充
data['Gender'].fillna(mode(data['Gender'])[0][0], inplace=True)
#保存模型
import statsmodels.api as sm
# 指定作为训练变量的列,不含目标列`admit`
train_cols = data.columns[1:]
logit = sm.Logit(data['aim'], data[train_cols])
# 拟合模型
result = logit.fit()
# 查看数据的要点
print result.summary()
from sklearn.externals import joblib
joblib.dump(result, 'lr.model')
result = joblib.load('lr.model')

法二:

https://www.zhihu.com/question/31604690

#区间替代
data['d']=data['c'].map(lambda x: 0 if x<5 else (1 if x<10 else 2))
#交集等操作 x需针对set
index522=data_copy_unsatisf_mode522.index
index523=data_copy_unsatisf_mode523.index
index522and523=index522 & index523
index522and523=list(index522and523)
#差集
index522only=list(index522.difference(index523))

1. 获取两个list 的交集

#方法一:

a=[2,3,4,5]

b=[2,5,8]

tmp = [val for val in a if val in b]

print tmp

#[2, 5]

#方法二

list(set(a).intersection(set(b)))

2. 获取两个list 的并集

list(set(a).union(set(b)))

3.获取两个 list 的差集

list(set(b).difference(set(a))) # b中有而a中没有的

#变量类型

for i, row in colTypes.iterrows(): #i: dataframe index; row: each row in series format

if row['feature']=="category":

data[row['feature']]=data[row['feature']].astype(np.object)

elif row['feature']=="cont":

data[row['feature']]=data[row['feature']].astype(np.float)

print data.dtypes

返回value_counts

# Per-file value_counts report: for every CSV under `source`, stack the
# sorted value_counts of columns 1..26 into one column named after the
# file's first 6 characters (the date), then collect all files side by side.
# NOTE(review): pasted without indentation -- loop bodies must be
# re-indented before this runs.
source = "E:\CCI_minmax"
data_k=pd.DataFrame()
for root, dirs, files in os.walk(source):
for fn in files :
data0=pd.read_csv('%s\%s'%(root,fn),usecols=range(1,27))
column_names=data0.columns.tolist()
data_date=pd.DataFrame()
for col_name in column_names:
val_count=data0[col_name].value_counts()
# sort_index keeps the category order stable across files.
data_date=pd.concat([data_date,val_count.sort_index()])
data_date.columns=[fn[0:6]]
# axis=1: one column per input file.
data_k=pd.concat([data_k,data_date],axis=1)