

import warnings

import numpy as np
import pandas as pd
import os
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d

# Relative location of the "ad.data" file inside the dataset folder.
filepath = os.path.join(
    "../dataset/Internet Advertisements/Data Folder",
    "ad.data",
)

def Converter_number(x):
    """Convert one raw CSV field to ``np.float64``.

    Args:
        x: The raw field value handed over by ``pd.read_csv``.

    Returns:
        The parsed ``np.float64`` value, or ``np.nan`` when the field cannot
        be parsed as a number (``ValueError``).
    """
    # The ``try`` header was missing, which left the ``except`` clause
    # orphaned and made the whole file a syntax error.
    try:
        return np.float64(x)
    except ValueError:
        return np.nan

# Columns 0..1557 all go through Converter_number.
converters = dict.fromkeys(range(1558), Converter_number)
# Column 1558 is turned into a 0/1 flag: 1 when the stripped field is 'ad.'.
converters[1558] = lambda x: int(x.strip() == 'ad.')
# header=None: the file is read without a header row, so columns are
# labelled by integer position.
ads = pd.read_csv(filepath, header=None, converters=converters)
#     0      1       2     3     4     5     6     7     8     9     ...   1549  \
# 0  125.0  125.0  1.0000   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
# 1   57.0  468.0  8.2105   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
# 2   33.0  230.0  6.9696   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
# 3   60.0  468.0  7.8000   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
# 4   60.0  468.0  7.8000   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   
#    1550  1551  1552  1553  1554  1555  1556  1557  1558  
# 0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1  
# 1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1  
# 2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1  
# 3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1  
# 4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0     1  
# [5 rows x 1559 columns]


# Keep only the rows in which no column is NaN.
ads = ads.dropna(axis=0, how="any")


#              0            1            2            3            4     \
# count  2359.000000  2359.000000  2359.000000  2359.000000  2359.000000   
# mean     63.912251   155.631624     3.912982     0.759644     0.002120   
# std      54.881130   130.237867     6.047220     0.427390     0.045999   
# min       1.000000     1.000000     0.001500     0.000000     0.000000   
# 25%      25.000000    80.500000     1.033450     1.000000     0.000000   
# 50%      51.000000   110.000000     2.111100     1.000000     0.000000   
# 75%      84.000000   184.000000     5.333300     1.000000     0.000000   
# max     640.000000   640.000000    60.000000     1.000000     1.000000   
#          5            6            7            8            9     \
# count  2359.0  2359.000000  2359.000000  2359.000000  2359.000000   
# mean      0.0     0.006359     0.004663     0.004663     0.014837   
# std       0.0     0.079504     0.068141     0.068141     0.120925   
# min       0.0     0.000000     0.000000     0.000000     0.000000   
# 25%       0.0     0.000000     0.000000     0.000000     0.000000   
# 50%       0.0     0.000000     0.000000     0.000000     0.000000   
# 75%       0.0     0.000000     0.000000     0.000000     0.000000   
# max       0.0     1.000000     1.000000     1.000000     1.000000   
#           ...              1549         1550         1551         1552  \
# count     ...       2359.000000  2359.000000  2359.000000  2359.000000   
# mean      ...          0.003815     0.001272     0.002120     0.002543   
# std       ...          0.061662     0.035646     0.045999     0.050379   
# min       ...          0.000000     0.000000     0.000000     0.000000   
# 25%       ...          0.000000     0.000000     0.000000     0.000000   
# 50%       ...          0.000000     0.000000     0.000000     0.000000   
# 75%       ...          0.000000     0.000000     0.000000     0.000000   
# max       ...          1.000000     1.000000     1.000000     1.000000   
#               1553         1554         1555        1556         1557  \
# count  2359.000000  2359.000000  2359.000000  2359.00000  2359.000000   
# mean      0.008478     0.013989     0.014837     0.00975     0.000848   
# std       0.091705     0.117470     0.120925     0.09828     0.029111   
# min       0.000000     0.000000     0.000000     0.00000     0.000000   
# 25%       0.000000     0.000000     0.000000     0.00000     0.000000   
# 50%       0.000000     0.000000     0.000000     0.00000     0.000000   
# 75%       0.000000     0.000000     0.000000     0.00000     0.000000   
# max       1.000000     1.000000     1.000000     1.00000     1.000000   
#               1558  
# count  2359.000000  
# mean      0.161509  
# std       0.368078  
# min       0.000000  
# 25%       0.000000  
# 50%       0.000000  
# 75%       0.000000  
# max       1.000000  
# [8 rows x 1559 columns]
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV

fig = plt.figure()

# NOTE(review): the candidate values of the original parameter grid are
# missing from this copy of the script (only the opening brace of the dict
# survived), so this bandwidth range is a placeholder -- confirm it against
# the original before relying on the selected bandwidth.
grid_param = {"bandwidth": np.logspace(-1, 1, 20)}

# Best estimator found for each column, keyed by column label. The original
# code overwrote `kde` on every repetition, keeping only the last fit.
kde_by_column = {}

# The same grid search was previously copy-pasted for columns 0, 1, 2 and 3.
# After the loop `df`, `kde_grid` and `kde` refer to column 3, exactly as
# they did after the last repetition of the original code.
# NOTE(review): nothing is drawn on `fig` here; the plotting calls appear to
# be missing from this copy as well.
for column in range(4):
    # Sorted column values reshaped to (n_samples, 1).
    df = (ads[column].sort_values().values)[:, np.newaxis]
    kde_grid = GridSearchCV(KernelDensity(), grid_param)
    kde = kde_grid.fit(df).best_estimator_
    kde_by_column[column] = kde

from collections import Counter

# Values of column 1558 (the 0/1 flag built by the column-1558 converter)
# as a plain array.
df1558 = ads.loc[:, 1558].values
# c = Counter(df1558)
# plt.bar(list(c.keys()),list(c.values()))
fig = plt.figure()





if __name__ == "__main__":
    # The guard had no body, which is a syntax error. There is nothing extra
    # to run here, so an explicit no-op keeps the guard valid.
    pass


上述代码主要用到了 scikit-learn 的 KernelDensity 类,它实现了核密度估计。核密度估计(kernel density estimation)在概率论中用来估计未知的概率密度函数,属于非参数估计方法之一,由 Rosenblatt(1955)和 Emanuel Parzen(1962)提出,又名 Parzen 窗(Parzen window)。Ruppert 和 Cline 基于数据集密度函数聚类算法提出了修订的核密度估计方法。




均匀核函数:$k(x)=\tfrac{1}{2},\ -1\le x\le 1$;加入带宽 $h$ 后:$k_h(x)=\tfrac{1}{2h},\ -h\le x\le h$

三角核函数:$k(x)=1-|x|,\ -1\le x\le 1$;加入带宽 $h$ 后:$k_h(x)=\dfrac{h-|x|}{h^{2}},\ -h\le x\le h$

伽马核函数:$k_{x_i}(x)=\dfrac{x^{\alpha-1}\exp(-x\alpha/x_i)}{(x_i/\alpha)^{\alpha}\,\Gamma(\alpha)}$





bandwidth : float

The bandwidth of the kernel.

algorithm : string

The tree algorithm to use. Valid options are ['kd_tree'|'ball_tree'|'auto']. Default is 'auto'.

kernel : string

The kernel to use. Valid kernels are ['gaussian'|'tophat'|'epanechnikov'|'exponential'|'linear'|'cosine']. Default is 'gaussian'.

metric : string

The distance metric to use. Note that not all metrics are valid with all algorithms. Refer to the documentation of BallTree and KDTree for a description of available algorithms. Note that the normalization of the density output is correct only for the Euclidean distance metric. Default is ‘euclidean’.

atol : float

The desired absolute tolerance of the result. A larger tolerance will generally lead to faster execution. Default is 0.

rtol : float

The desired relative tolerance of the result. A larger tolerance will generally lead to faster execution. Default is 1E-8.

breadth_first : boolean

If true (default), use a breadth-first approach to the problem. Otherwise use a depth-first approach.

leaf_size : int

Specify the leaf size of the underlying tree. See BallTree or KDTree for details. Default is 40.

metric_params : dict

Additional parameters to be passed to the tree for use with the metric. For more information, see the documentation of BallTree or KDTree.



fit(X[, y])

Fit the Kernel Density model on the data.


get_params([deep])

Get parameters for this estimator.

sample([n_samples, random_state])

Generate random samples from the model.

score(X[, y])

Compute the total log probability under the model.


score_samples(X)

Evaluate the density model on the data.


set_params(**params)

Set the parameters of this estimator.
