FE-RentListingInquiries
Rent Listing Inquiries 数据集的特征工程
import必要的工具包,用于文件读取/特征编码
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from nltk.metrics import distance as distance
#from sklearn.cross_validation import StratifiedKFold
from sklearn.model_selection import StratifiedKFold
from MeanEncoder import MeanEncoder
数据文件路径和文件名
# Input data: both raw JSON files live in the same directory.
dpath = './data/'
train, test = (
    pd.read_json(dpath + json_name)
    for json_name in ("RentListingInquries_train.json", "RentListingInquries_test.json")
)
标签interest_level
将类别型的标签interest_level编码为数字
从前面的分析和常识来看,listing_id对确定interest_level没有用,去掉
特征编码对训练集和测试集都要做,所以干脆将二者连起来一起处理
# Encode the categorical target as integers: high -> 0, medium -> 1, low -> 2.
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = train['interest_level'].apply(lambda level: y_map[level])

# Keep the target as a Series, but give it a fresh 0..ntrain-1 index.
# train_test below is re-indexed the same way, so any later index-aligned
# operation (e.g. pd.concat(..., axis=1)) pairs each row with its own label.
# Without this, y_train would carry whatever index pd.read_json produced.
y_train = train['interest_level'].reset_index(drop=True)

# listing_id is not used as a feature; the test ids are kept aside.
train = train.drop(['listing_id', 'interest_level'], axis=1)
listing_id = test.listing_id.values
test = test.drop('listing_id', axis=1)

# Stack train on top of test so every encoding step is applied to both;
# the first ntrain rows of train_test are the training rows.
ntrain = train.shape[0]
train_test = pd.concat((train, test), axis=0).reset_index(drop=True)
y_train
price, bathrooms, bedrooms
数值型特征,可以做加、减、乘、除等组合运算
特征的单调变换对XGBoost不必要
# Remove some noise: cap extreme prices at 13000.
# (An alternative considered in the original was the 99th percentile:
#  np.percentile(train_test.price.values, 99).)
# A single .loc assignment replaces the removed `.ix` indexer and avoids
# chained assignment, which may silently write to a temporary copy.
train_test.loc[train_test['price'] > 13000, 'price'] = 13000

# Remove some noise: replace specific implausible bathroom counts.
train_test.loc[train_test["bathrooms"] == 112, "bathrooms"] = 1.5
train_test.loc[train_test["bathrooms"] == 10, "bathrooms"] = 1
train_test.loc[train_test["bathrooms"] == 20, "bathrooms"] = 2

# Price per room; the +1.0 keeps the denominator non-zero for 0-room listings.
train_test['price_bathrooms'] = (train_test["price"]) / (train_test["bathrooms"] + 1.0)
train_test['price_bedrooms'] = (train_test["price"]) / (train_test["bedrooms"] + 1.0)

# Simple room-count combinations.
train_test["room_diff"] = train_test["bathrooms"] - train_test["bedrooms"]
train_test["room_num"] = train_test["bedrooms"] + train_test["bathrooms"]
创建日期
# Parse the creation timestamp once, then fan it out into calendar parts.
train_test['Date'] = pd.to_datetime(train_test['created'])
date_parts = train_test['Date'].dt
for column, part in (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Day', 'day'),
    ('Wday', 'dayofweek'),
    ('Yday', 'dayofyear'),
    ('hour', 'hour'),
):
    train_test[column] = getattr(date_parts, part)

# The raw and parsed timestamps are no longer needed as features.
train_test = train_test.drop(['Date', 'created'], axis=1)
description
# Number of space-separated tokens in the description.
# For a string s, len(s.split(" ")) is always s.count(" ") + 1.
train_test["num_description_words"] = train_test["description"].map(
    lambda text: text.count(" ") + 1
)
train_test = train_test.drop('description', axis=1)
manager_id
将manager分为几个等级
top 1%, 2%, 5%, 10%, 15%, 20%, 25%, 30%, 50%
# Flag listings whose manager is among the most active ones.
# managers_count maps manager_id -> number of listings (train and test).
managers_count = train_test['manager_id'].value_counts()

# For each tier, a manager is in the "top p%" when its listing count is at
# or above the (100 - p)-th percentile of all managers' counts. The threshold
# and the id set are computed once per tier instead of once per row.
# The tier order is kept as in the original so column order is unchanged.
for top_pct in (10, 25, 5, 50, 1, 2, 15, 20, 30):
    threshold = np.percentile(managers_count.values, 100 - top_pct)
    top_ids = managers_count.index[managers_count.values >= threshold]
    train_test['top_{}_manager'.format(top_pct)] = (
        train_test['manager_id'].isin(top_ids).astype('int64')
    )
building_id
类似manager_id处理
# Flag listings whose building is among the most frequently listed ones.
# buildings_count maps building_id -> number of listings (train and test).
buildings_count = train_test['building_id'].value_counts()

# For each tier, a building is in the "top p%" when its listing count is at
# or above the (100 - p)-th percentile of all buildings' counts. The threshold
# and the id set are computed once per tier instead of once per row.
# The tier order is kept as in the original so column order is unchanged.
for top_pct in (10, 25, 5, 50, 1, 2, 15, 20, 30):
    threshold = np.percentile(buildings_count.values, 100 - top_pct)
    top_ids = buildings_count.index[buildings_count.values >= threshold]
    train_test['top_{}_building'.format(top_pct)] = (
        train_test['building_id'].isin(top_ids).astype('int64')
    )
photos
# Replace the photo collection of each listing by its size.
train_test['photos_count'] = train_test['photos'].map(len)
train_test.drop('photos', axis=1, inplace=True)
latitude, longitude
聚类降维编码(#用训练数据训练,对训练数据和测试数据都做变换)
到中心的距离(论坛上讨论到曼哈顿中心的距离更好)
# Clustering: fit KMeans on the training coordinates only, then assign a
# cluster id to every row (train and test).
# .loc slicing is label-based and inclusive, hence ntrain-1 for the last
# training row (train_test has a 0..n-1 index at this point).
train_location = train_test.loc[:ntrain-1, ['latitude', 'longitude']]
test_location = train_test.loc[ntrain:, ['latitude', 'longitude']]

# A fixed random_state makes the cluster ids reproducible between runs;
# without it the generated feature changes every time the script is executed.
kmeans_cluster = KMeans(n_clusters=20, random_state=0)
kmeans_cluster.fit(train_location)

# train_location followed by test_location is exactly the coordinate columns
# of train_test, so predict on them directly.
res = kmeans_cluster.predict(train_test[['latitude', 'longitude']])
train_test['cenroid'] = res
# L1 (Manhattan) distance from each listing to the mean training coordinate.
center = [train_location[axis_name].mean() for axis_name in ('latitude', 'longitude')]
train_test['distance'] = (
    (train_test['latitude'] - center[0]).abs()
    + (train_test['longitude'] - center[1]).abs()
)
display_address
# Normalise the display address: lower-case, surrounding whitespace removed.
train_test['display_address'] = [
    address.lower().strip() for address in train_test['display_address']
]
street_address
# Normalise the street address: lower-case, surrounding whitespace removed.
train_test['street_address'] = [
    address.lower().strip() for address in train_test['street_address']
]
类别型特征
LabelEncoder
# High-cardinality categorical columns to be turned into integer codes.
# (Alternative noted in the original: select every object-dtype column,
#  [x for x in train_test.columns if train_test[x].dtype == 'object'].)
categoricals = ['building_id', 'manager_id', 'display_address', 'street_address']
for feat in categoricals:
    # One encoder per column, fitted on train and test values together.
    lbl = LabelEncoder()
    train_test[feat] = lbl.fit_transform(train_test[feat].values.tolist())
定义高基数类别型特征编码函数
(manager_id, building_id, display_address,street_address )
对这些特征进行均值编码(该特征值在每个类别的概率,即原来的一维特征变成了C-1维特征,C为标签类别数目)
# Mean-encode the label-encoded categorical columns.
# NOTE(review): MeanEncoder is a project-local class; its exact output columns
# are not visible here -- presumably one per (feature, target class); confirm.
me = MeanEncoder(categoricals)
# train: fit the encoder on the first ntrain rows together with their labels
# (debugging aid left by the original author: import pdb; pdb.set_trace())
# NOTE(review): train_new has a 0..ntrain-1 index; y_train should share it if
# MeanEncoder aligns on index labels -- confirm against MeanEncoder.
train_new = train_test.iloc[:ntrain, :]
train_new_cat = me.fit_transform(train_new, y_train)
# test: apply the fitted encoder to the remaining rows
test_new = train_test.iloc[ntrain:, :]
test_new_cat = me.transform(test_new)
# Re-stack train and test, then drop the raw label-encoded columns.
train_test = pd.concat((train_new_cat, test_new_cat), axis=0).reset_index(drop=True)
train_test.drop(categoricals, axis=1, inplace=True)
features
描述特征文字长度
特征中单词的词频,相当于以数据集features中出现的词语为字典的one-hot编码(虽然是词频,但在这个任务中每个单词在一条记录里通常只出现一次)
# Number of feature tags attached to each listing.
train_test['features_count'] = train_test['features'].apply(len)

# Join each listing's tags into one string and count the 200 most frequent
# unigrams (English stop words removed).
train_test['features2'] = train_test['features'].apply(lambda tags: ' '.join(tags))
c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse = c_vect.fit_transform(train_test['features2'])

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(c_vect, 'get_feature_names_out'):
    c_vect_sparse_cols = list(c_vect.get_feature_names_out())
else:
    c_vect_sparse_cols = c_vect.get_feature_names()

train_test.drop(['features', 'features2'], axis=1, inplace=True)

# hstack is the last feature-processing step: every other column must already
# be numeric before it can be stacked with the sparse word counts.
train_test_sparse = sparse.hstack([train_test, c_vect_sparse]).tocsr()
特征处理结果存为文件
y_train
# Save as CSV so the result can be inspected in a spreadsheet.
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]

# pd.concat(axis=1) aligns rows on index labels. X_train is indexed
# 0..ntrain-1, while y_train may still carry the index of the loaded JSON,
# so give the labels the same positional index before joining. Otherwise
# rows can be mismatched or padded with NaN.
train_new = pd.concat((X_train, y_train.reset_index(drop=True)), axis=1).reset_index(drop=True)
train_new.to_csv(dpath + 'RentListingInquries_FE_train.csv', index=False)

X_test.to_csv(dpath + 'RentListingInquries_FE_test.csv', index=False)
from scipy.io import mmwrite

# Split the stacked sparse matrix back into its train and test rows.
X_train_sparse, X_test_sparse = train_test_sparse[:ntrain, :], train_test_sparse[ntrain:, :]

# Append the labels as the last column of the training matrix.
label_column = sparse.csr_matrix(y_train).T
train_sparse = sparse.hstack([X_train_sparse, label_column]).tocsr()

# Write both matrices in Matrix Market format.
# NOTE(review): mmwrite may append '.mtx' to names without that extension --
# confirm the resulting file names on disk.
mmwrite(dpath + 'RentListingInquries_FE_train.txt', train_sparse)
mmwrite(dpath + 'RentListingInquries_FE_test.txt', X_test_sparse)
# Alternative (disabled): save in libsvm sparse format, which is more
# efficient when feeding XGBoost directly.
# NOTE(review): the first disabled call below is malformed (empty first
# argument, matrix passed last) and the second omits the label argument;
# check both against the dump_svmlight_file documentation before re-enabling.
#from sklearn.datasets import dump_svmlight_file
#dump_svmlight_file(, y_train, dpath + 'RentListingInquries_FE_train.txt',X_train_sparse)
#dump_svmlight_file(X_test_sparse, dpath + 'RentListingInquries_FE_test.txt')
# Rebuild the dense train/test frames from the stacked sparse matrix.
train_test_new = pd.DataFrame(train_test_sparse.toarray())
X_train = train_test_new.iloc[:ntrain, :]
X_test = train_test_new.iloc[ntrain:, :]

# pd.concat(axis=1) aligns rows on index labels. X_train is indexed
# 0..ntrain-1, so give y_train the same positional index before joining.
# Otherwise rows can be mismatched or padded with NaN.
train_new = pd.concat((X_train, y_train.reset_index(drop=True)), axis=1)