Image classification, also known as image recognition, is a fundamental task in computer vision. In this task we assume that each image contains a single object. Classification problems come in two flavors: binary classification and multi-class classification.
We will cover the following topics:
- Exploring the dataset
- Creating a custom dataset
- Splitting the dataset
- Preprocessing the data
- Creating dataloaders
- Building the classification model
- Defining the loss function
- Defining the optimizer
- Training and evaluating the model
- Deploying the model
- Model inference on the test dataset
Exploring the dataset
Data preparation
After downloading the Histopathologic Cancer Detection dataset, extract it into a folder named data.
The data folder contains two subfolders: train and test. The train folder holds 220,025 .tif images of size 96x96, and each image's file name is its ID. The train_labels.csv file provides the ground-truth labels for the images in the train folder.
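As a quick check of the layout described above (a minimal added sketch that only assumes the ./data/train folder exists), you can count the .tif files:
import os
path2train = "./data/train/"
n_tif = sum(1 for f in os.listdir(path2train) if f.endswith(".tif"))
print("number of training images:", n_tif)
# number of training images: 220025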
- Read train_labels.csv and print the first few rows:
import pandas as pd
path2csv = "./data/train_labels.csv"
labels_df = pd.read_csv(path2csv)
labels_df.head()
- Print the number of samples in each class:
print(labels_df['label'].value_counts())
# 0 130908
# 1 89117
# Name: label, dtype:int64
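The counts above correspond to roughly 59% negative and 41% positive samples, which is worth keeping in mind when we set up a baseline later. A small added check of the proportions, reusing labels_df:
print(labels_df['label'].value_counts(normalize=True))
# 0    0.59 (approximately)
# 1    0.41 (approximately)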
- Plot a histogram of the labels:
labels_df['label'].hist()
- Visualize a few images:
import matplotlib.pylab as plt
from PIL import Image, ImageDraw
import numpy as np
import os
# %matplotlib inline
# get IDs for malignant images
malignantIds = labels_df.loc[labels_df['label']==1]['id'].values
# data is stored here
path2train = "./data/train/"
# show images in grayscale; set color to True to display them in color
color = False
# set figure size
plt.rcParams['figure.figsize'] = (10.0, 10.0)
plt.subplots_adjust(wspace=0, hspace=0)
nrows,ncols=3,3
# display the images
for i, id_ in enumerate(malignantIds[:nrows*ncols]):
    full_filenames = os.path.join(path2train, id_ + '.tif')
    # load image
    img = Image.open(full_filenames)
    # draw a 32*32 rectangle
    draw = ImageDraw.Draw(img)
    draw.rectangle(((32, 32), (64, 64)), outline="green")
    plt.subplot(nrows, ncols, i+1)
    if color is True:
        plt.imshow(np.array(img))
    else:
        plt.imshow(np.array(img)[:, :, 0], cmap="gray")
    plt.axis('off')
plt.show()
- Get the image shape and the minimum and maximum pixel values:
print("image shape:", np.array(img).shape)
print("pixel values range from %s to %s" % (np.min(img), np.max(img)))
# image shape: (96, 96, 3)
# pixel values range from 0 to 255
Creating a custom dataset
We can create a custom dataset by subclassing the PyTorch Dataset class. When defining a custom dataset class, make sure to implement two essential methods: __len__ and __getitem__. The __len__ method returns the length of the dataset and can be invoked through Python's len function; the __getitem__ method returns the image at the specified index.
- First, import the required packages and define the histCancerDataset class:
from PIL import Image
import torch
from torch.utils.data import Dataset
import pandas as pd
import torchvision.transforms as transforms
import os
# don't forget to fix the random seed for reproducibility
# fix torch random seed
torch.manual_seed(0)

class histCancerDataset(Dataset):
    def __init__(self, data_dir, transform, data_type="train"):
        # path to images
        path2data = os.path.join(data_dir, data_type)
        # get a list of images
        filenames = os.listdir(path2data)
        # get the full path to images
        self.full_filenames = [os.path.join(path2data, f) for f in filenames]
        # labels are in a csv file named train_labels.csv
        csv_filename = data_type + "_labels.csv"
        path2csvLabels = os.path.join(data_dir, csv_filename)
        labels_df = pd.read_csv(path2csvLabels)
        # set data frame index to id
        labels_df.set_index("id", inplace=True)
        # obtain labels from data frame
        self.labels = [labels_df.loc[filename[:-4]].values[0] for filename in filenames]
        self.transform = transform

    def __len__(self):
        # return size of dataset
        return len(self.full_filenames)

    def __getitem__(self, idx):
        # open image, apply transforms and return with label
        image = Image.open(self.full_filenames[idx])  # PIL image
        image = self.transform(image)
        return image, self.labels[idx]
- Define the image transformations:
import torchvision.transforms as transforms
data_transformer = transforms.Compose([
    transforms.ToTensor()
])
- Create a dataset object using the basic transform:
data_dir = "./data/"
histo_dataset = histCancerDataset(data_dir, data_transformer, "train")
print(len(histo_dataset))
# 220025
- Load an image using the custom dataset (note that ToTensor scales pixel values from [0, 255] to [0.0, 1.0]):
# load an image
img,label = histo_dataset[9]
print(img.shape, torch.min(img), torch.max(img))
# torch.Size([3, 96, 96]) tensor(0.) tensor(1.)
Splitting the dataset
We will split the dataset into training and validation sets and display a few samples from each.
- Split histo_dataset:
from torch.utils.data import random_split
len_histo = len(histo_dataset)
len_train = int(0.8 * len_histo)
len_val = len_histo - len_train
train_ds, val_ds = random_split(histo_dataset, [len_train, len_val])
print("train dataset length: ", len(train_ds))
print("val dataset length: ", len(val_ds))
# train dataset length: 176020
# val dataset length: 44005
- Get an image from the training set:
for x, y in train_ds:
    print(x.shape, y)
    break
# torch.Size([3, 96, 96]) 1
- Get an image from the validation set:
for x, y in val_ds:
    print(x.shape, y)
    break
# torch.Size([3, 96, 96]) 1
- Display a few sample images from the training set:
# import the required package
from torchvision import utils
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline
np.random.seed(0)
# define a function to show image:
def show(img, y, color=False):
    # convert tensor to numpy array
    npimg = img.numpy()
    # convert to H*W*C shape
    npimg_tr = np.transpose(npimg, (1, 2, 0))
    if not color:
        npimg_tr = npimg_tr[:, :, 0]
        plt.imshow(npimg_tr, interpolation="nearest", cmap="gray")
    else:
        plt.imshow(npimg_tr, interpolation="nearest")
    plt.title("label: " + str(y))
    plt.show()
# create a grid of sample images:
grid_size = 4
rnd_inds = np.random.randint(0, len(train_ds), grid_size)
print("image indices: ", rnd_inds)
x_grid_train = [train_ds[i][0] for i in rnd_inds]
y_grid_train = [train_ds[i][1] for i in rnd_inds]
x_grid_train = utils.make_grid(x_grid_train, nrow=4, padding=2)
print(x_grid_train.shape)
# display the grid
plt.rcParams["figure.figsize"] = (10.0, 5)
show(x_grid_train, y_grid_train)
# image indices: [43567 173685 117852 152315]
# torch.Size([3, 100, 394])
- Display a few samples from val_ds:
grid_size = 4
rnd_inds = np.random.randint(0, len(val_ds), grid_size)
print("image indices:", rnd_inds)
x_grid_val = [val_ds[i][0] for i in rnd_inds]
y_grid_val = [val_ds[i][1] for i in rnd_inds]
x_grid_val = utils.make_grid(x_grid_val, nrow=4, padding=2)
print(x_grid_val.shape)
show(x_grid_val, y_grid_val)
# image indices: [30112 23456 121345 45673]
# torch.Size([3, 99, 393])
Data transforms and augmentation
Image transforms and augmentation are essential for training deep learning models. By applying transforms we can effectively expand our dataset, and normalization helps achieve better model performance. Typical transforms include horizontal and vertical flips, rotation, and resizing. For a binary classification task we can apply various image transforms without changing the labels: for example, a malignant image that is rotated or flipped is still malignant. In this section you will learn how to use the torchvision package to perform on-the-fly image transforms during training.
- First, define the following transforms for the training set:
train_transformer = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),
    transforms.RandomRotation(45),
    transforms.RandomResizedCrop(96, scale=(0.8, 1.0), ratio=(1.0, 1.0)),
    transforms.ToTensor()
])
- For the validation set we do not need any augmentation, so we only convert the images to tensors:
val_transformer = transforms.Compose([transforms.ToTensor()])
- Update the transform functions of train_ds and val_ds (see the note after this snippet):
# overwrite the transform functions
train_ds.transform = train_transformer
val_ds.transform = val_transformer
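Note that train_ds and val_ds returned by random_split are Subset objects that share the same underlying histo_dataset, so assigning to their transform attributes as above may not actually change the transform applied in __getitem__. A minimal workaround sketch (an added suggestion, not part of the original recipe) is to build one dataset object per transform and reuse the split indices:
from torch.utils.data import Subset
# hypothetical workaround: one dataset object per transform, reusing the split indices
# (assumes os.listdir returns the files in the same order for both instances)
train_base = histCancerDataset(data_dir, train_transformer, "train")
val_base = histCancerDataset(data_dir, val_transformer, "train")
train_ds = Subset(train_base, train_ds.indices)
val_ds = Subset(val_base, val_ds.indices)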
Creating dataloaders
We are now ready to create PyTorch dataloaders. Without a dataloader we would have to write our own code to loop over the dataset and extract mini-batches; the PyTorch DataLoader automates this process.
- First, define dataloaders for the training and validation sets:
from torch.utils.data import DataLoader
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)
- Then, extract a batch from the training dataloader:
# extract a batch from training data
for x, y in train_dl:
    print(x.shape)
    print(y.shape)
    break
# torch.Size([32, 3, 96, 96])
# torch.Size([32])
- Finally, extract a batch from the validation dataloader:
# extract a batch from val data
for x, y in val_dl:
    print(x.shape)
    print(y.shape)
    break
# torch.Size([32, 3, 96, 96])
# torch.Size([32])
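As a small added check, the number of mini-batches per epoch can be read directly from the dataloaders; the values follow from the 80/20 split and batch_size=32:
print(len(train_dl), len(val_dl))
# 5501 1376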
Building the classification model
We will define the model, move it to a GPU device, and print a model summary.
- Establish baseline performance on the validation set:
# get labels for validation dataset
y_val = [ y for _, y in val_ds]
def accuracy(labels, out):
    return np.sum(out == labels) / float(len(labels))
# accuracy all zeros predictions
acc_all_zeros = accuracy(y_val, np.zeros_like(y_val))
print("accuracy all zero prediction: %.2f" % acc_all_zeros)
# accuracy all zero prediction: 0.60
# accuracy all ones predictions
acc_all_ones = accuracy(y_val, np.ones_like(y_val))
print("accuracy all one prediction: %.2f" % acc_all_ones )
# accuracy all one prediction: 0.40
# accuracy random predictions
acc_random = accuracy(y_val, np.random.randint(2, size=len(y_val)))
print("accuracy random prediction:%.2f"%acc_random)
# accuracy random prediction: 0.50
- Define a helper function that computes the output size of a CNN layer:
import torch.nn as nn
import numpy as np

def findConv2dOutShape(H_in, W_in, conv, pool=2):
    # get conv arguments
    kernel_size = conv.kernel_size
    stride = conv.stride
    padding = conv.padding
    dilation = conv.dilation
    H_out = np.floor((H_in + 2*padding[0] - dilation[0]*(kernel_size[0]-1) - 1) / stride[0] + 1)
    W_out = np.floor((W_in + 2*padding[1] - dilation[1]*(kernel_size[1]-1) - 1) / stride[1] + 1)
    if pool:
        H_out /= pool
        W_out /= pool
    return int(H_out), int(W_out)

# for example
conv1 = nn.Conv2d(3, 8, kernel_size=3)
h, w = findConv2dOutShape(96, 96, conv1)
print(h, w)
# 47 47
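As a quick added check (reusing conv1 from above), you can verify the helper against an actual forward pass through the convolution followed by 2x2 max pooling:
import torch
import torch.nn.functional as F
x = torch.zeros(1, 3, 96, 96)
y = F.max_pool2d(conv1(x), 2, 2)
print(y.shape)
# torch.Size([1, 8, 47, 47])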
- Next, implement the CNN model:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, params):
        super(Net, self).__init__()
        C_in, H_in, W_in = params["input_shape"]
        init_f = params["initial_filters"]
        num_fc1 = params["num_fc1"]
        num_classes = params["num_classes"]
        self.dropout_rate = params["dropout_rate"]
        self.conv1 = nn.Conv2d(C_in, init_f, kernel_size=3)
        h, w = findConv2dOutShape(H_in, W_in, self.conv1)
        self.conv2 = nn.Conv2d(init_f, 2*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv2)
        self.conv3 = nn.Conv2d(2*init_f, 4*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv3)
        self.conv4 = nn.Conv2d(4*init_f, 8*init_f, kernel_size=3)
        h, w = findConv2dOutShape(h, w, self.conv4)
        # compute the flattened size
        self.num_flatten = h*w*8*init_f
        self.fc1 = nn.Linear(self.num_flatten, num_fc1)
        self.fc2 = nn.Linear(num_fc1, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv3(x))
        x = F.max_pool2d(x, 2, 2)
        x = F.relu(self.conv4(x))
        x = F.max_pool2d(x, 2, 2)
        x = x.view(-1, self.num_flatten)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, self.dropout_rate, training=self.training)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
- Create a Net object:
# dict to define model parameters
params_model = {"input_shape": (3, 96, 96),
"initial_filters": 8,
"num_fc1": 100,
"dropout_rate": 0.25,
"num_classes": 2
}
# create model
cnn_model = Net(params_model)
- Move the model to the GPU if one is available:
# move model to cuda/gpu device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = cnn_model.to(device)
- Print the model:
print(cnn_model)
- Verify the model's device:
print(next(cnn_model.parameters()).device)
# cuda:0
- Get the model summary:
from torchsummary import summary
summary(cnn_model, input_size=(3, 96, 96), device=device.type)
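As an optional sanity check (a small added snippet; x_dummy is just a zero-filled batch, and device is the variable defined above), you can pass a dummy batch through the model and confirm the output shape is (batch_size, num_classes):
x_dummy = torch.zeros(2, 3, 96, 96, device=device)
with torch.no_grad():
    out = cnn_model(x_dummy)
print(out.shape)
# torch.Size([2, 2])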
Defining the loss function
The standard loss function for a classification task is the cross-entropy loss, or log loss. However, when defining the loss function we need to take the model output and its activation function into account. For a binary classification task we can choose either one output or two outputs. The typical pairings of output activation and loss function are:
- one output, sigmoid activation: nn.BCELoss
- one output, no activation: nn.BCEWithLogitsLoss
- two outputs, no activation: nn.CrossEntropyLoss
- two outputs, log_softmax activation: nn.NLLLoss
We recommend the log_softmax option because it extends more easily to multi-class classification. For numerical stability and speed, PyTorch combines the log and softmax operations into a single function.
# first, define the loss function
loss_func = nn.NLLLoss(reduction="sum")
# a simple example
# fix random seed
torch.manual_seed(0)
n,c = 8,2
y = torch.randn(n, c, requires_grad=True)
ls_F = nn.LogSoftmax(dim=1)
y_out = ls_F(y)
print(y_out.shape)
target = torch.randint(c, size=(n,))
print(target.shape)
loss = loss_func(y_out, target)
print(loss.item())
# torch.Size([8, 2])
# torch.Size([8])
# 5.266995429992676
# call backward() to compute the gradients of the loss with respect to y;
# printing y.data shows the (unchanged) input values, while the gradients themselves are stored in y.grad
loss.backward()
print(y.data)
# tensor([[-1.1258, -1.1524],
# [-0.2506, -0.4339],
# [ 0.8487, 0.6920],
# [-0.3160, -2.1152],
# [ 0.3223, -1.2633],
# [ 0.3500, 0.3081],
# [ 0.1198, 1.2377],
# [ 1.1168, -0.2473]])
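Since nn.CrossEntropyLoss combines LogSoftmax and NLLLoss internally, the same loss value can also be computed directly from the raw scores y (a small added check, reusing the tensors defined above):
ce_loss = nn.CrossEntropyLoss(reduction="sum")
print(ce_loss(y, target).item())
# 5.266995429992676 (same value as above)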
Defining the optimizer
The torch.optim package provides implementations of common optimizers. An optimizer holds the current state and updates the parameters based on the computed gradients. For binary classification tasks, SGD and Adam are the most widely used optimizers. Another useful tool in the torch.optim package is the learning rate scheduler, which automatically adjusts the learning rate during training to improve model performance.
Here we will define an optimizer, read its current learning rate, and define a learning rate scheduler.
# 1. first, define an Adam optimizer with a learning rate of 3e-4
from torch import optim
opt = optim.Adam(cnn_model.parameters(), lr=3e-4)
# 2. read the current learning rate
def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']
current_lr = get_lr(opt)
print("current lr={}".format(current_lr))
# current lr=0.0003
# 3. define a ReduceLROnPlateau learning rate scheduler
from torch.optim.lr_scheduler import ReduceLROnPlateau
# define learning rate scheduler
# the mode argument specifies whether the monitored quantity should decrease or increase during training:
# if we monitor the loss we set mode='min'; if we monitor the accuracy we should set mode='max'.
lr_scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=20, verbose=1)
# 4. use the following example to see how the learning rate scheduler works
for i in range(100):
    lr_scheduler.step(1)
# Epoch 21: reducing learning rate of group 0 to 1.5000e-04.
# Epoch 42: reducing learning rate of group 0 to 7.5000e-05.
# Epoch 63: reducing learning rate of group 0 to 3.7500e-05.
# Epoch 84: reducing learning rate of group 0 to 1.8750e-05.
Training and evaluating the model
So far we have created the datasets, built the model, and defined the loss function and the optimizer. In this section we will implement the training and validation scripts. Because these scripts can become long and repetitive, we will first build a few helper functions for better readability and to avoid code duplication.
# 1. first, implement a function that counts the number of correct predictions in a mini-batch
def metrics_batch(output, target):
    pred = output.argmax(dim=1, keepdim=True)
    corrects = pred.eq(target.view_as(pred)).sum().item()
    return corrects
# 2. then, implement a function that computes the loss for a mini-batch
def loss_batch(loss_func, output, target, opt=None):
    loss = loss_func(output, target)
    with torch.no_grad():
        metric_b = metrics_batch(output, target)
    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()
    return loss.item(), metric_b
# 3. next, implement a function that computes the loss and metric for an entire epoch
def loss_epoch(model, loss_func, dataset_dl, sanity_check=False, opt=None):
    running_loss = 0.0
    running_metric = 0.0
    len_data = len(dataset_dl.dataset)
    for xb, yb in dataset_dl:
        # move batch to device
        xb = xb.to(device)
        yb = yb.to(device)
        # get model output
        output = model(xb)
        # get loss per batch
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
        # update running loss
        running_loss += loss_b
        # update running metric
        if metric_b is not None:
            running_metric += metric_b
        # break the loop in case of a sanity check
        if sanity_check is True:
            break
    # average loss value
    loss = running_loss / float(len_data)
    # average metric value
    metric = running_metric / float(len_data)
    return loss, metric
# 4. implement the train_val function
def train_val(model, params):
    # extract model params
    num_epochs = params["num_epochs"]
    loss_func = params["loss_func"]
    opt = params["optimizer"]
    train_dl = params["train_dl"]
    val_dl = params["val_dl"]
    sanity_check = params["sanity_check"]
    lr_scheduler = params["lr_scheduler"]
    path2weights = params["path2weights"]
    # history of loss values in each epoch
    loss_history = {
        "train": [],
        "val": []
    }
    # history of metric values in each epoch
    metric_history = {
        "train": [],
        "val": []
    }
    # a deep copy of weights for the best performing model
    best_model_wts = copy.deepcopy(model.state_dict())
    # initialize best loss to a large value
    best_loss = float("inf")
    # main loop
    for epoch in range(num_epochs):
        # get current learning rate
        current_lr = get_lr(opt)
        print("Epoch {}/{}, current lr={}".format(epoch, num_epochs-1, current_lr))
        # train model on training dataset
        model.train()
        train_loss, train_metric = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        # collect loss and metric for training dataset
        loss_history["train"].append(train_loss)
        metric_history["train"].append(train_metric)
        # evaluate model on validation dataset
        model.eval()
        with torch.no_grad():
            val_loss, val_metric = loss_epoch(model, loss_func, val_dl, sanity_check)
        # collect loss and metric for validation dataset
        loss_history["val"].append(val_loss)
        metric_history["val"].append(val_metric)
        # store the best weights
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            # store weights into a local file
            torch.save(model.state_dict(), path2weights)
            print("Copied best model weights!")
        # the learning rate scheduler monitors the validation loss
        lr_scheduler.step(val_loss)
        # whenever the learning rate is reduced, resume training from the best weights
        if current_lr != get_lr(opt):
            print("Loading best model weights")
            model.load_state_dict(best_model_wts)
        print("train loss: %.6f, dev loss: %.6f, accuracy: %.2f" % (train_loss, val_loss, 100*val_metric))
        print("-"*10)
    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history
# 5. set sanity_check to True and run the code
import copy
loss_func = nn.NLLLoss(reduction="sum")
opt=optim.Adam(cnn_model.parameters(), lr=3e-4)
lr_scheduler = ReduceLROnPlateau(opt, mode="min", factor=0.5, patience=20, verbose=1)
# define training parameters and call the train_val function
params_train = {
    "num_epochs": 100,
    "optimizer": opt,
    "loss_func": loss_func,
    "train_dl": train_dl,
    "val_dl": val_dl,
    "sanity_check": True,
    "lr_scheduler": lr_scheduler,
    "path2weights": "./models/weights.pt",
}
# train and validate the model
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train)
# Epoch 0/99, current lr=0.0003
# Copied best model weights!
# train loss: 0.000129, dev loss: 0.001024, accuracy: 0.05
# ----------
# Epoch 1/99, current lr=0.0003
# Copied best model weights!
# train loss: 0.000125, dev loss: 0.001021, accuracy: 0.05
# ...
# 6. plot loss_hist and metric_hist
# train-validation progress
num_epochs = params_train["num_epochs"]
# plot loss progress
plt.title("Train-Val Loss")
plt.plot(range(1, num_epochs+1), loss_hist["train"], label="train")
plt.plot(range(1, num_epochs+1), loss_hist["val"], label="val")
plt.ylabel("Loss")
plt.xlabel("Training Epochs")
plt.legend()
plt.show()
# plot accuracy progress
plt.title("Train-Val Accuracy")
plt.plot(range(1, num_epochs+1), metric_hist["train"], label="train")
plt.plot(range(1, num_epochs+1), metric_hist["val"], label="val")
plt.ylabel("Accuracy")
plt.xlabel("Training Epochs")
plt.legend()
plt.grid()
plt.show()
# 7. now that we are confident the code runs correctly, set sanity_check to False and run it again:
# define training parameters and call the train_val function
params_train = {
    "num_epochs": 100,
    "optimizer": opt,
    "loss_func": loss_func,
    "train_dl": train_dl,
    "val_dl": val_dl,
    "sanity_check": False,
    "lr_scheduler": lr_scheduler,
    "path2weights": "./models/weights.pt",
}
# train and validate the model
cnn_model, loss_hist, metric_hist = train_val(cnn_model, params_train)
Deploying the model
We will define the model, load the trained weights, and deploy it.
- First, create a Net object to load the stored weights into:
# model parameters
params_model = {
"input_shape": (3, 96, 96),
"initial_filters": 8,
"num_fc1": 100,
"dropout_rate": 0.25,
"num_classes": 2,
}
# instantiate the model
cnn_model = Net(params_model)
- Load the model weights:
# load state_dict into the model
path2weights="./models/weights.pt"
cnn_model.load_state_dict(torch.load(path2weights))
- Set the model to evaluation mode:
# set model in evaluation mode
cnn_model.eval()
- Move the model to the GPU if one is available:
# move model to cuda/gpu device if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cnn_model = cnn_model.to(device)
- Implement the deploy_model function:
import time

def deploy_model(model, dataset, device, num_classes=2, sanity_check=False):
    len_data = len(dataset)
    # initialize the output tensor on the CPU due to GPU memory limits
    y_out = torch.zeros(len_data, num_classes)
    # initialize the ground truth on the CPU due to GPU memory limits
    y_gt = np.zeros((len_data), dtype="uint8")
    # move model to device
    model = model.to(device)
    elapsed_times = []
    with torch.no_grad():
        for i in range(len_data):
            x, y = dataset[i]
            y_gt[i] = y
            start = time.time()
            y_out[i] = model(x.unsqueeze(0).to(device))
            elapsed = time.time() - start
            elapsed_times.append(elapsed)
            if sanity_check is True:
                break
    inference_time = np.mean(elapsed_times) * 1000
    print("average inference time per image on %s: %.2f ms" % (device, inference_time))
    return y_out.numpy(), y_gt
- Deploy the model on the validation set:
y_out, y_gt = deploy_model(cnn_model, val_ds, device=device, sanity_check=False)
print(y_out.shape, y_gt.shape)
# average inference time per image on cuda:0: 0.74 ms
# (44005, 2) (44005,)
- Compute the accuracy of the model on the validation set:
from sklearn.metrics import accuracy_score
# get predictions
y_pred = np.argmax(y_out, axis=1)
print(y_pred.shape, y_gt.shape)
# compute accuracy
acc = accuracy_score(y_pred, y_gt)
print("accuracy: %.2f" % acc)
# (44005,) (44005,)
# accuracy:0.91
- Measure the inference time on the CPU:
device_cpu = torch.device("cpu")
y_out,y_gt = deploy_model(cnn_model, val_ds, device=device_cpu, sanity_check=False)
print(y_out.shape, y_gt.shape)
# average inference time per image on cpu: 2.21 ms
# (44005, 2) (44005,)
Model inference on the test dataset
# 1. first, load test_labels.csv and print the first few rows
path2csv = "./data/test_labels.csv"
labels_df = pd.read_csv(path2csv)
labels_df.head()
# 2. create a dataset object for the test data
histo_test = histCancerDataset(data_dir, val_transformer, data_type="test")
print(len(histo_test))
# 57458
# 3. deploy the model on the test dataset
y_test_out, _ = deploy_model(cnn_model, histo_test,device,sanity_check=False)
# average inference time per image on cuda:0: 0.74 ms
y_test_pred=np.argmax(y_test_out, axis=1)
print(y_test_pred.shape)
# (57458,)
# 4. display the predictions
grid_size=4
rnd_inds = np.random.randint(0, len(histo_test), grid_size)
print("image indices:", rnd_inds)
x_grid_test = [histo_test[i][0] for i in rnd_inds]
y_grid_test = [y_test_pred[i] for i in rnd_inds]
x_grid_test = utils.make_grid(x_grid_test, nrow=4, padding=2)
print(x_grid_test.shape)
plt.rcParams["figure.figsize"]=(10.0, 5)
show(x_grid_test, y_grid_test)
# image indices: [2732 43567 43567 12346]
# torch.Size([3, 100, 394])
Creating a submission file
print(y_test_out.shape)
cancer_preds = np.exp(y_test_out[:,1])
print(cancer_preds.shape)
# (57458, 2)
# (57458,)
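Because the model returns log_softmax outputs, np.exp converts the log-probabilities back to probabilities; column 1 is the predicted probability of the malignant class. As a small added check, the exponentiated rows should sum to one:
probs = np.exp(y_test_out)
print(np.allclose(probs.sum(axis=1), 1.0))
# True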
# convert the predicted probabilities to a DataFrame and store it in a CSV file
path2sampleSub = "./data/" + "sample_submission.csv"
sample_df = pd.read_csv(path2sampleSub)
ids_list = list(sample_df.id)
pred_list = [p for p in cancer_preds]
# map each image id (file name without the .tif extension) to its predicted probability
pred_dict = dict((os.path.basename(f)[:-4], p) for (f, p) in zip(histo_test.full_filenames, pred_list))
pred_list_sub = [pred_dict[id_] for id_ in ids_list]
submission_df = pd.DataFrame({"id": ids_list, "label": pred_list_sub})
if not os.path.exists("./submissions/"):
    os.makedirs("./submissions/")
    print("submission folder created!")
path2submission = "./submissions/submission.csv"
submission_df.to_csv(path2submission, header=True, index=False)
submission_df.head()
You can now submit the CSV file to the Histopathologic Cancer Detection competition (the competition has since closed).