机器学习两周学习成果

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
# 1.获取数据集
iris = load_iris()
# 2.数据基本处理
# x_train,x_test,y_train,y_test为训练集特征值、测试集特征值、训练集目标值、测试集目标值
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.2, random_state=22)
# 3、特征工程：标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4、机器学习(模型训练)
estimator = KNeighborsClassifier(n_neighbors=9)
estimator.fit(x_train, y_train)
# 5、模型评估
# 方法1：比对真实值和预测值
y_predict = estimator.predict(x_test)
print("预测结果为:\n", y_predict)
print("比对真实值和预测值：\n", y_predict == y_test)
# 方法2：直接计算准确率
score = estimator.score(x_test, y_test)
print("准确率为：\n", score)

1.1.3 实验结果

机器学习两周学习成果_facebook

1.2 案例2 预测facebook签到位置

1.2.1 数据集介绍

大赛的目的是预测一个人想签入到哪个地方。对于本次比赛的目的，Facebook的创建一个人造的世界，包括位于10公里的10平方公里超过10万米的地方。对于一个给定的坐标，你的任务是返回最有可能的地方的排名列表。数据制作出类似于来自移动设备的位置的信号，给你需要什么与不准确的，嘈杂的价值观复杂的真实数据工作一.番风味。不-致的和错误的位置数据可能破坏，如Facebook入住服务经验。
数据介绍：将根据用户的位置，准确性和时间戳预测用户正在查看的业务。

train.csv，test.csv
row_id：登记事件的ID
xy：坐标
准确性：定位准确性
时间：时间戳
place_id：业务的ID，这是您预测的目标

1.2.2 代码过程

# 1、获取数据集
facebook = pd.read_csv("./data/FBlocation/train.csv")
2.基本数据处理
# 2.基本数据处理
# 2.1 缩小数据范围
facebook_data = facebook.query("x>2.0 & x<2.5 & y>2.0 & y<2.5")
# 2.2 选择时间特征
time = pd.to_datetime(facebook_data["time"], unit="s")
time = pd.DatetimeIndex(time)
facebook_data["day"] = time.day
facebook_data["hour"] = time.hour
facebook_data["weekday"] = time.weekday
# 2.3 去掉签到较少的地方
place_count = facebook_data.groupby("place_id").count()
place_count = place_count[place_count["row_id"]>3]
facebook_data = facebook_data[facebook_data["place_id"].isin(place_count.index)]
# 2.4 确定特征值和目标值
x = facebook_data[["x", "y", "accuracy", "day", "hour", "weekday"]]
y = facebook_data["place_id"]
# 2.5 分割数据集
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3.特征工程--特征预处理(标准化)
# 3.1 实例化一个转换器
transfer = StandardScaler()
# 3.2 调用fit_transform
x_train = transfer.fit_transform(x_train)
x_test = transfer.fit_transform(x_test)

# 4.机器学习--knn+cv
# 4.1 实例化一个估计器
estimator = KNeighborsClassifier()
# 4.2 调用gridsearchCV
param_grid = {"n_neighbors": [1, 3, 5, 7, 9]}
estimator = GridSearchCV(estimator, param_grid=param_grid, cv=5)
# 4.3 模型训练
estimator.fit(x_train, y_train)

# 5.模型评估
# 5.1 预测值输出
y_pre = estimator.predict(x_test)
print("预测值为:\n", y_pre)

# 5.2 score
score = estimator.score(x_test, y_test)
print("准确率为:\n", score)
# 5.3 其他评价指标
print("最好的模型：\n", estimator.best_estimator_)
print("最好的结果:\n", estimator.best_score_)
print("整体模型结果:\n", estimator.cv_results_)

1.2.3 实验结果

机器学习两周学习成果_人工智能_02

机器学习两周学习成果_数据_03

2. 线性回归原理学习和案例完成

2.1 案例波士顿房价预测

2.1.1 数据介绍

属性名	解释	类型
CRIM	该镇的人均犯罪率	连续值
ZN	占地面积超过25,000平方的住宅用地比例	连续值
INDUS	非零售商业用地比例	连续值
CHAS	是否邻近Charles River	离散值，1=邻近; 0=不邻近
NOX	一氧化氮浓度	连续值
RM	每栋房屋的平均客房数	连续值
AGE	1940年之前建成的自用单位比例	连续值
DIS	到波士顿5个就业中心的加权距离	连续值
RAD	到径向公路的可达性指数	连续值
TAX	全值财产税率	连续值
PTRATIO	学生与教师的比例	连续值
B	1000(BK - 0.63)^2,其中BK为黑人占比	连续值
LSTAT	低收入人群占比	连续值
MEDV	同类房屋价格的中位数	连续值

2.2.2 代码过程

def linear_model1():
    """
    线性回归:正规方程
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(特征方程)
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)


def linear_model2():
    """
    线性回归:梯度下降法
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(特征方程)
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)

2.2.3 实验结果

机器学习两周学习成果_数据集_04

机器学习两周学习成果_数据集_05

机器学习两周学习成果_人工智能_06

机器学习两周学习成果_facebook_07

3.逻辑回归原理学习和案例实现

3.1案例癌症分类预测-良／恶性乳腺癌肿瘤预测

3.1.1 数据介绍

（1）699条样本，共11列数据，第一列用语检索的id，后9列分别是与肿瘤相关的医学特征，最后一列表示肿瘤类型的数值。
（2）包含16个缺失值，用”?”标出。

3.1.2 代码过程

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

import ssl
ssl._create_default_https_context = ssl._create_unverified_context
# 1.获取数据
names = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
                   'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin',
                   'Normal Nucleoli', 'Mitoses', 'Class']

data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                  names=names)
data.head()
# 2.基本数据处理
# 2.1 缺失值处理
data = data.replace(to_replace="?", value=np.NaN)
data = data.dropna()
# 2.2 确定特征值,目标值
x = data.iloc[:, 1:10]
x.head()
y = data["Class"]
y.head()
# 2.3 分割数据
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
# 3.特征工程(标准化)
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 4.机器学习(逻辑回归)
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# 5.模型评估
y_predict = estimator.predict(x_test)
y_predict
estimator.score(x_test, y_test)

3.3.3 实验结果

机器学习两周学习成果_人工智能_08

机器学习两周学习成果_数据集_09

4.决策树原理学习和案例实现

4.1 案例：泰坦尼克号乘客生存预测

4.1.1 数据介绍

在泰坦尼克号和titanic2数据帧描述泰坦尼克号上的个别乘客的生存状态。这里使用的数据集是由各种研究人员开始的。其中包括许多研究人员创建的旅客名单，由Michael A. Findlay编辑。提取的数据集中的特征是票的类别，存活，乘坐班，年龄，登陆，home.dest，房间，票，船和性别。

4.1.2 代码过程

import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# 1、获取数据
titan = pd.read_csv("http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt")
# 2.数据基本处理
# 2.1 确定特征值,目标值
x = titan[["pclass", "age", "sex"]]
y = titan["survived"]
# 2.2 缺失值处理
# 缺失值需要处理，将特征当中有类别的这些特征进行字典特征抽取
x['age'].fillna(x['age'].mean(), inplace=True)
# 2.3 数据集划分
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)
# 3.特征工程(字典特征抽取)

# 特征中出现类别符号，需要进行one-hot编码处理(DictVectorizer)

x.to_dict(orient="records") 需要将数组特征转换成字典数据

# 对于x转换成字典数据x.to_dict(orient="records")
# [{"pclass": "1st", "age": 29.00, "sex": "female"}, {}]

transfer = DictVectorizer(sparse=False)

x_train = transfer.fit_transform(x_train.to_dict(orient="records"))
x_test = transfer.fit_transform(x_test.to_dict(orient="records"))
# 4.决策树模型训练和模型评估
# 决策树API当中，如果没有指定max_depth那么会根据信息熵的条件直到最# 终结束。这里我们可以指定树的深度来进行限制树的大小

# 4.机器学习(决策树)
estimator = DecisionTreeClassifier(criterion="entropy", max_depth=5)
estimator.fit(x_train, y_train)

# 5.模型评估
ret=estimator.score(x_test, y_test)
ret
y_pre = estimator.predict(x_test)
print(y_pre)

4.3.3 实验结果

机器学习两周学习成果_数据集_10

机器学习两周学习成果_数据集_11

5.聚类原理学习和案例实现

5.1 案例：探究用户对物品类别的喜好细分降维

5.1.1 数据介绍

order_products__prior.csv：订单与商品信息

字段：order_id, product_id, add_to_cart_order, reordered

products.csv：商品信息

字段：product_id, product_name, aisle_id, department_id

orders.csv：用户的订单信息

字段：order_id,user_id,eval_set,order_number,….

aisles.csv：商品所属具体物品类别

字段： aisle_id, aisle

5.1.2 代码过程

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# 1.获取数据
order_product = pd.read_csv("./data/instacart/order_products__prior.csv")
products = pd.read_csv("./data/instacart/products.csv")
orders = pd.read_csv("./data/instacart/orders.csv")
aisles = pd.read_csv("./data/instacart/aisles.csv")
# 2.数据基本处理

# 2.1 合并表格
# 2.1 合并表格
table1 = pd.merge(order_product, products, on=["product_id", "product_id"])
table2 = pd.merge(table1, orders, on=["order_id", "order_id"])
table = pd.merge(table2, aisles, on=["aisle_id", "aisle_id"])
# 2.2 交叉表合并
table = pd.crosstab(table["user_id"], table["aisle"])
# 2.3 数据截取
table = table[:1000]
# 3.特征工程 — pca

transfer = PCA(n_components=0.9)
data = transfer.fit_transform(table)
# 4.机器学习（k-means）

estimator = KMeans(n_clusters=8, random_state=22)
estimator.fit_predict(data)
# 5.模型评估

silhouette_score(data, y_predict)

5.3.3 实验结果

机器学习两周学习成果_数据_12

6.其它成果

阅读内容

《机器学习》（周志华）部分章节
《Python机器学习基础教程》（Sarah Guido）部分章节
推荐系统、深度学习、交通预测相关的论文

上一篇：onedrive 配置教你如何获取5T windows资源存储空间和提高数据同步速度

下一篇：[JVM]类（Class）文件结构

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯