from scipy import ndimage # scipy.ndimage: Multi-dimentional image processing(多维图像处理包) 更强大的图像处理库包括:opencv, scikit-image等 from collections import Counter # collections模块包含多种集合类,Counter是其中之一,它是一个简单的计数器,统计字符出现的个数,是dict的一个子类 from core.vggnet import Vgg19 # core.vggnet.Vgg19类从imagenet-vgg-verydeep-19.mat中获取了预训练参数,用这些预训练参数构造了vgg19网络的计算模型。 from core.utils import * import tensorflow as tf import numpy as np import pandas as pd import hickle # hickle与pickle都是常用的序列化/反序列化模块,用来保存程序运行结果或者加载包含程序所需信息的文件。 import os # os模块用于程序与操作系统交互,访问文件夹 import json # json和hickle,pickle作用类似
以上是 preprocess.py 的 import 信息。该 Python 文件包含多个函数,内容较多,需要先理清头绪、找出程序入口——程序入口即下面的 main() 函数。
def main():
    """Build the caption datasets and extract VGG19 features for every split.

    The pipeline has two stages:

    1. Caption stage: the train/val caption files are processed into
       annotation tables, the validation table is cut into a ``val`` and a
       ``test`` split, and for each split the vocabulary (train only),
       caption vectors, file names, image indices and BLEU reference
       captions are pickled.
    2. Feature stage: every unique image of each split is pushed through
       the pretrained VGG19 network in batches and the resulting feature
       array is saved with hickle.

    Takes no arguments and returns nothing; all results are written under
    ``./data/<split>/``. The helpers used here (``_process_caption_data``,
    ``_build_vocab``, ``save_pickle``, ``load_pickle``, ...) are defined
    outside this function.
    """
    # batch size for extracting feature vectors from vggnet
    # (images per forward pass).
    batch_size = 100
    # maximum length of caption (number of words); captions longer than
    # max_length are deleted.
    max_length = 15
    # if a word occurs less than word_count_threshold times in the training
    # dataset, its index is the special unknown token.
    word_count_threshold = 1
    # vgg model path
    vgg_model_path = './data/imagenet-vgg-verydeep-19.mat'

    # about 80000 images and 400000 captions for train dataset.
    # The caption file holds the captions and their link to the images;
    # image_dir is the folder with the images themselves.
    # ('./data/...' and 'data/...' refer to the same location.)
    train_dataset = _process_caption_data(
        caption_file='data/annotations/captions_train2014.json',
        image_dir='image/train2014_resized',
        max_length=max_length)

    # about 40000 images and 200000 captions
    # NOTE(review): the directory was 'image/val_resized'; changed to follow
    # the 'train2014_resized' / 'captions_val2014.json' naming -- confirm it
    # matches the folder produced by the resize step.
    val_dataset = _process_caption_data(
        caption_file='data/annotations/captions_val2014.json',
        image_dir='image/val2014_resized',
        max_length=max_length)

    # about 4000 images and 20000 captions for val / test dataset:
    # the first 10% of the validation annotations become 'val', the next
    # 10% become 'test'.
    val_cutoff = int(0.1 * len(val_dataset))
    test_cutoff = int(0.2 * len(val_dataset))
    print('Finished processing caption data')

    # The datasets are sliced and support reset_index(), so they are
    # presumably pandas DataFrames. Only the test slice has its index reset:
    # it is the only one that does not start at row 0 (assuming the source
    # tables carry a default 0..n-1 index -- TODO confirm).
    save_pickle(train_dataset, 'data/train/train.annotations.pkl')
    save_pickle(val_dataset[:val_cutoff], 'data/val/val.annotations.pkl')
    save_pickle(val_dataset[val_cutoff:test_cutoff].reset_index(drop=True),
                'data/test/test.annotations.pkl')

    # From here on, each of the three saved splits is processed in turn.
    for split in ['train', 'val', 'test']:
        annotations = load_pickle('./data/%s/%s.annotations.pkl' % (split, split))

        if split == 'train':
            # The vocabulary is built from the training split only and then
            # reused for 'val' and 'test' (this relies on 'train' being the
            # first split in the loop).
            word_to_idx = _build_vocab(annotations=annotations,
                                       threshold=word_count_threshold)
            # Fixed path: was '.data/...' (missing slash), unlike every
            # other output path in this function.
            save_pickle(word_to_idx, './data/%s/word_to_idx.pkl' % split)

        # Encode every caption with the vocabulary and store the result.
        captions = _build_caption_vector(annotations=annotations,
                                         word_to_idx=word_to_idx,
                                         max_length=max_length)
        save_pickle(captions, './data/%s/%s.captions.pkl' % (split, split))

        # NOTE(review): presumably these record how image file names, image
        # ids and captions map onto each other -- verify against the helper
        # definitions.
        file_names, id_to_idx = _build_file_names(annotations)
        save_pickle(file_names, './data/%s/%s.file.names.pkl' % (split, split))

        image_idxs = _build_image_idxs(annotations, id_to_idx)
        save_pickle(image_idxs, './data/%s/%s.image.idxs.pkl' % (split, split))

        # prepare reference captions to compute bleu scores later.
        # A new bucket is opened each time a not-yet-seen image_id appears;
        # every caption goes into the most recently opened bucket.
        seen_image_ids = set()
        feature_to_captions = {}
        i = -1
        for caption, image_id in zip(annotations['caption'], annotations['image_id']):
            if image_id not in seen_image_ids:
                seen_image_ids.add(image_id)
                i += 1
                feature_to_captions[i] = []
            feature_to_captions[i].append(caption.lower() + ' .')
        save_pickle(feature_to_captions, './data/%s/%s.references.pkl' % (split, split))
        print("finished building %s caption dataset" % split)

    # extract conv5_3 feature vectors
    vggnet = Vgg19(vgg_model_path)  # load the pretrained parameters
    vggnet.build()                  # build the VGG19 computation graph
    with tf.Session() as sess:
        # NOTE(review): tf.initialize_all_variables() is the pre-0.12 name of
        # tf.global_variables_initializer(); kept to match the TF version
        # this script targets.
        tf.initialize_all_variables().run()
        for split in ['train', 'val', 'test']:
            anno_path = './data/%s/%s.annotations.pkl' % (split, split)
            save_path = './data/%s/%s.features.hkl' % (split, split)
            annotations = load_pickle(anno_path)
            # One feature row per distinct image file.
            image_path = list(annotations['file_name'].unique())
            n_example = len(image_path)

            all_feats = np.empty([n_example, 196, 512], dtype=np.float32)

            for start in range(0, n_example, batch_size):
                # Clamp so the last (possibly partial) batch reports the
                # true number of processed images.
                end = min(start + batch_size, n_example)
                image_batch_file = image_path[start:end]
                # A list comprehension instead of map(): on Python 3 map()
                # returns an iterator, which np.array() would wrap as a
                # single object instead of stacking the images.
                # NOTE(review): scipy.ndimage.imread was removed in
                # SciPy 1.2; this needs an older SciPy or another reader.
                image_batch = np.array(
                    [ndimage.imread(path, mode='RGB') for path in image_batch_file]
                ).astype(np.float32)
                feats = sess.run(vggnet.features,
                                 feed_dict={vggnet.images: image_batch})
                all_feats[start:end, :] = feats
                print("Processed %d %s features.." % (end, split))

            # use hickle to save huge feature vectors
            hickle.dump(all_feats, save_path)
            print("Saved %s.." % (save_path))