yolov1-
网络输出数据格式和标签的格式如下图所示:
2008_000082-label:
14 0.637 0.650666666667 0.13 0.250666666667
14 0.328 0.612 0.084 0.146666666667
3 0.163 0.576 0.234 0.16
13 0.474 0.8 0.436 0.325333333333
其中
14 0.637 0.650666666667 0.13 0.250666666667
=
1(置信度) 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0(20个类别的one-hot,类别14对应下标14;按fill_truth_region的写入顺序,置信度在最前)
0.637(x) 0.650666666667(y) 0.13(w) 0.250666666667(h)
batch=32
subdivisions=8
height=448
width=448
channels=3
cells=7*7
每个cell的标签长度=20+1+4=25=class+conf+xywh
y的本源追溯:
/*
 * Train on the samples in `d`, one mini-batch of net.batch rows at a time.
 *
 * Only full mini-batches are visited: d.X.rows / net.batch of them. Each one is
 * copied into flat staging buffers by get_next_batch() and handed to
 * train_network_datum(). The values it returns are summed and the sum is
 * divided by the number of rows visited (batches * batch size).
 */
float train_network(network net, data d)
{
    const int batch_size = net.batch;
    const int num_batches = d.X.rows / batch_size;

    /* Staging buffers holding one mini-batch of inputs and of truth values. */
    float *X = calloc(batch_size*d.X.cols, sizeof(float));
    float *y = calloc(batch_size*d.y.cols, sizeof(float));

    float total = 0;
    int step = 0;
    while (step < num_batches) {
        /* Rows [step*batch_size, (step+1)*batch_size) form this mini-batch. */
        get_next_batch(d, batch_size, step*batch_size, X, y);
        total += train_network_datum(net, X, y);
        ++step;
    }

    free(X);
    free(y);
    return total/(num_batches*batch_size);
}
函数中:
d.X.rows=32
d.X.cols=602112=448*448*3
d.Y.rows=32
d.Y.cols=1225=25*49
y缓冲区大小=4*1225个float(batch*d.y.cols)
X缓冲区大小=4*602112个float(batch*d.X.cols)
batch=4
n=8
/*
 * Copy `n` consecutive rows of d.X and d.y, starting at row `offset`, into the
 * flat buffers X and y. Row k of the batch lands at X + k*d.X.cols and
 * y + k*d.y.cols, so both buffers must hold at least n rows.
 */
void get_next_batch(data d, int n, int offset, float *X, float *y)
{
    const size_t x_row_bytes = d.X.cols*sizeof(float);
    const size_t y_row_bytes = d.y.cols*sizeof(float);
    float *x_dst = X;
    float *y_dst = y;
    int k;

    for (k = 0; k < n; ++k) {
        memcpy(x_dst, d.X.vals[offset + k], x_row_bytes);
        memcpy(y_dst, d.y.vals[offset + k], y_row_bytes);
        /* Advance each destination by one row. */
        x_dst += d.X.cols;
        y_dst += d.y.cols;
    }
}
void *memcpy(void *str1, const void *str2, size_t n)
参数
str1 -- 指向用于存储复制内容的目标数组,类型强制转换为 void* 指针。
str2 -- 指向要复制的数据源,类型强制转换为 void* 指针。
n -- 要被复制的字节数
d.X.rows=32
d.X.cols=602112=448*448*3
d.Y.rows=32
d.Y.cols=1225=25*49
data d的来源是args.d(load_thread中通过 *a.d = load_data_xxx(...) 写入)
buffer又从哪里来呢?最初的buffer是从load_data_in_thread(args);
这个函数中获得的,我们来剖析下该函数
//data.c
/*
 * Start a thread that runs load_thread() on a heap-allocated copy of `args`
 * and return the new thread's handle.
 *
 * `args` is received by value, so it is copied to the heap and the pointer to
 * that copy is what the new thread receives.
 *
 * Failures (allocation or thread creation) are reported through error().
 * NOTE(review): this function assumes error() does not return - confirm
 * against its definition.
 */
pthread_t load_data_in_thread(load_args args)
{
    pthread_t thread;
    struct load_args *ptr = calloc(1, sizeof(struct load_args));

    /* calloc may return NULL; never write through it unchecked. */
    if (!ptr) error("Load args allocation failed");
    *ptr = args;

    /* Run load_thread on the copy; pthread_create returns nonzero on failure. */
    if (pthread_create(&thread, 0, load_thread, ptr)) {
        /* The thread never started, so nothing else will release the copy. */
        free(ptr);
        error("Thread creation failed");
    }
    return thread;
}
//data.c
void *load_thread(void *ptr)
{
//printf("Loading data: %d\n", rand());
load_args a = *(struct load_args*)ptr;
if(a.exposure == 0) a.exposure = 1;
if(a.saturation == 0) a.saturation = 1;
if(a.aspect == 0) a.aspect = 1;
if (a.type == OLD_CLASSIFICATION_DATA){
*a.d = load_data_old(a.paths, a.n, a.m, a.labels, a.classes, a.w, a.h);
} else if (a.type == CLASSIFICATION_DATA){
*a.d = load_data_augment(a.paths, a.n, a.m, a.labels, a.classes, a.hierarchy, a.min, a.max, a.size, a.angle, a.aspect, a.hue, a.saturation, a.exposure);
} else if (a.type == SUPER_DATA){
*a.d = load_data_super(a.paths, a.n, a.m, a.w, a.h, a.scale);
} else if (a.type == WRITING_DATA){
*a.d = load_data_writing(a.paths, a.n, a.m, a.w, a.h, a.out_w, a.out_h);
} else if (a.type == REGION_DATA){
//因为a.type == REGION_DATA,所以调用这个函数,我们继续追~
*a.d = load_data_region(a.n, a.paths, a.m, a.w, a.h, a.num_boxes, a.classes, a.jitter, a.hue, a.saturation, a.exposure);
.
.
//data.c
data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter, float hue, float saturation, float exposure)
{
char **random_paths = get_random_paths(paths, n, m);
int i;
data d = {0};
d.shallow = 0;
//n就是batch size啦
d.X.rows = n;
//给X(也就是图像数据)分配内存
d.X.vals = calloc(d.X.rows, sizeof(float*));
d.X.cols = h*w*3;
int k = size*size*(5+classes);
//终于找到你啦~\(≧▽≦)/~。这里先给y分配了内存,注意一共分配了n*k个float类型的内存块,为什么分配这么多呢?慢慢往下看~
d.y = make_matrix(n, k);
for(i = 0; i < n; ++i){
//读取图像
image orig = load_image_color(random_paths[i], 0, 0);
int oh = orig.h;
int ow = orig.w;
//这里jitter=0.2(cfg文件中有写),这就是所谓的抖动了,其实就是crop(数据增广的一种)
//剪掉的不能太多,这里设置图像的左边和右边最多剪掉dw(整幅图像宽度的1/5),上边和下边最多剪掉dh(整幅图像高度的1/5)
int dw = (ow*jitter);
int dh = (oh*jitter);
//rand_uniform生成(-dw, dw)的一个随机数
int pleft = rand_uniform(-dw, dw);
int pright = rand_uniform(-dw, dw);
int ptop = rand_uniform(-dh, dh);
int pbot = rand_uniform(-dh, dh);
//swidth是图像剪完后的宽度,sheight是图像剪完后的高度
int swidth = ow - pleft - pright;
int sheight = oh - ptop - pbot;
//sx是图像剪完后宽度和原始图像的宽度比,同理sy
float sx = (float)swidth / ow;
float sy = (float)sheight / oh;
//设置图像随机翻转
int flip = rand()%2;
//开始剪切图像,咔咔咔,具体代码不看了,很简单~
image cropped = crop_image(orig, pleft, ptop, swidth, sheight);
//dx=pleft/swidth,dy=ptop/sheight
float dx = ((float)pleft/ow)/sx;
float dy = ((float)ptop /oh)/sy;
//都剪完了,当然要把图像重新resize到448*448(论文中说了,输入图像是448*448)
image sized = resize_image(cropped, w, h);
//翻转图像~
if(flip) flip_image(sized);
//对图像的色调(hue)、饱和度(saturation)、曝光(exposure)做随机扰动(颜色增广),不是“排序”
random_distort_image(sized, hue, saturation, exposure);
//最终d.X.vals[]存储的就是要输入的数据啦,准备好X了,我们去准备下y
d.X.vals[i] = sized.data;
//开始追y,追追追~
fill_truth_region(random_paths[i], d.y.vals[i], classes, size, flip, dx, dy, 1./sx, 1./sy);
/*
 * Build a batch of n training samples for region/detection training.
 *
 * For each of n randomly chosen paths the image is loaded, cropped by a random
 * amount on every side (bounded by `jitter` times the image size), resized to
 * w x h and optionally mirrored. The matching truth vector is filled by
 * fill_truth_region() using the same crop/flip parameters.
 *
 * Returns a `data` whose X has n rows of h*w*3 floats and whose y has n rows
 * of size*size*(5+classes) floats.
 */
data load_data_region(int n, char **paths, int m, int w, int h, int size, int classes, float jitter)
{
    char **random_paths = get_random_paths(paths, n, m);
    const int truth_len = size*size*(5+classes);
    data d = {0};
    int s;

    d.shallow = 0;
    d.X.rows = n;
    d.X.cols = h*w*3;
    d.X.vals = calloc(d.X.rows, sizeof(float*));
    d.y = make_matrix(n, truth_len);

    for (s = 0; s < n; ++s) {
        image orig = load_image_color(random_paths[s], 0, 0);
        int oh = orig.h;
        int ow = orig.w;

        /* Upper bound on how much may be cut from (or padded onto) each side. */
        int max_dw = (ow*jitter);
        int max_dh = (oh*jitter);

        /* Random offsets per side; the order of these draws is kept as-is. */
        int crop_left  = rand_uniform(-max_dw, max_dw);
        int crop_right = rand_uniform(-max_dw, max_dw);
        int crop_top   = rand_uniform(-max_dh, max_dh);
        int crop_bot   = rand_uniform(-max_dh, max_dh);

        /* Size of the cropped window and its ratio to the original size. */
        int swidth  = ow - crop_left - crop_right;
        int sheight = oh - crop_top - crop_bot;
        float sx = (float)swidth / ow;
        float sy = (float)sheight / oh;

        int flip = rand_r(&data_seed)%2;

        image cropped = crop_image(orig, crop_left, crop_top, swidth, sheight);

        /* Crop offset expressed as a fraction of the cropped window. */
        float dx = ((float)crop_left/ow)/sx;
        float dy = ((float)crop_top /oh)/sy;

        image sized = resize_image(cropped, w, h);
        if (flip) {
            flip_image(sized);
        }

        /* The resized pixel buffer is stored in the batch, so `sized` is not freed here. */
        d.X.vals[s] = sized.data;

        fill_truth_region(random_paths[s], d.y.vals[s], classes, size, flip, dx, dy, 1./sx, 1./sy);

        free_image(orig);
        free_image(cropped);
    }

    free(random_paths);
    return d;
}
/*
 * Return a new w x h image with im.c channels whose pixel (i, j) is taken from
 * `im` at column i+dx, row j+dy.
 *
 * Source coordinates are first passed through constrain_int(.., 0, im.w-1) and
 * constrain_int(.., 0, im.h-1). If a coordinate is still outside the image
 * after that, the destination pixel is set to 0.
 */
image crop_image(image im, int dx, int dy, int w, int h)
{
    image cropped = make_image(w, h, im.c);
    int ch, y, x;

    for (ch = 0; ch < im.c; ++ch) {
        for (y = 0; y < h; ++y) {
            for (x = 0; x < w; ++x) {
                int src_row = constrain_int(y + dy, 0, im.h-1);
                int src_col = constrain_int(x + dx, 0, im.w-1);
                int inside = (src_row >= 0 && src_row < im.h &&
                              src_col >= 0 && src_col < im.w);
                float val = inside ? get_pixel(im, src_col, src_row, ch) : 0;

                set_pixel(cropped, x, y, ch, val);
            }
        }
    }
    return cropped;
}
//data.c
/*
 * Fill the truth vector of one image for region/detection training.
 *
 * path      - image path; the label path is derived from it by text replacement
 * truth     - output buffer of num_boxes*num_boxes cells, (5+classes) floats each
 * classes   - number of classes
 * num_boxes - number of grid cells per side
 * flip, dx, dy, sx, sy - augmentation parameters forwarded to correct_boxes()
 *
 * Per-cell layout written here: [object flag, `classes` class slots, x, y, w, h].
 * x and y are stored relative to the cell; w and h are stored as read from the box.
 * A cell that already holds an object keeps its first box; later boxes falling
 * into the same cell are dropped.
 */
void fill_truth_region(char *path, float *truth, int classes, int num_boxes, int flip, float dx, float dy, float sx, float sy)
{
    char labelpath[4096];
    char *image_exts[] = {".jpg", ".png", ".JPG", ".JPEG"};
    int e;

    /* Derive the label file path: swap the image directory name for "labels"
       and the image extension for ".txt". */
    find_replace(path, "images", "labels", labelpath);
    find_replace(labelpath, "JPEGImages", "labels", labelpath);
    for (e = 0; e < 4; ++e) {
        find_replace(labelpath, image_exts[e], ".txt", labelpath);
    }

    /* Read the boxes; `count` receives how many there are. */
    int count = 0;
    box_label *boxes = read_boxes(labelpath, &count);

    /* Shuffle the boxes, then map them into the cropped/flipped image frame. */
    randomize_boxes(boxes, count);
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);

    const int cell_len = 5+classes;
    int b;
    for (b = 0; b < count; ++b) {
        float bw = boxes[b].w;
        float bh = boxes[b].h;
        int id = boxes[b].id;

        /* Boxes that became too small after the crop are not used. */
        if (bw < .01 || bh < .01) continue;

        /* Grid cell containing the box centre. */
        float gx = boxes[b].x*num_boxes;
        float gy = boxes[b].y*num_boxes;
        int col = (int)gx;
        int row = (int)gy;

        /* Centre offset inside that cell. */
        float cx = gx - col;
        float cy = gy - row;

        float *cell = truth + (col+row*num_boxes)*cell_len;

        /* Cell already claimed by an earlier box. */
        if (cell[0]) continue;

        cell[0] = 1;                                /* object flag */
        if (id < classes) cell[1 + id] = 1;         /* class slot */
        cell[1 + classes + 0] = cx;
        cell[1 + classes + 1] = cy;
        cell[1 + classes + 2] = bw;
        cell[1 + classes + 3] = bh;
    }
    free(boxes);
}
/*
 * Map n label boxes into the coordinate frame of the augmented image.
 *
 * Each edge is scaled by (sx, sy) and shifted by (dx, dy). When `flip` is set
 * the left/right edges are mirrored around 1. Edges are then clamped to [0, 1]
 * and the centre (x, y) and size (w, h) are recomputed from the clamped edges.
 */
void correct_boxes(box_label *boxes, int n, float dx, float dy, float sx, float sy, int flip)
{
    int k;
    for (k = 0; k < n; ++k) {
        box_label *b = boxes + k;

        /* Scale, then shift. */
        b->left   = b->left  * sx - dx;
        b->right  = b->right * sx - dx;
        b->top    = b->top   * sy - dy;
        b->bottom = b->bottom* sy - dy;

        if (flip) {
            /* Mirror horizontally: old right edge becomes the new left edge. */
            float old_left = b->left;
            b->left  = 1. - b->right;
            b->right = 1. - old_left;
        }

        /* Clamp every edge (constrain takes min, max, value). */
        b->left   = constrain(0, 1, b->left);
        b->right  = constrain(0, 1, b->right);
        b->top    = constrain(0, 1, b->top);
        b->bottom = constrain(0, 1, b->bottom);

        /* Centre and size derived from the clamped edges. */
        b->x = (b->left+b->right)/2;
        b->y = (b->top+b->bottom)/2;
        b->w = constrain(0, 1, (b->right - b->left));
        b->h = constrain(0, 1, (b->bottom - b->top));
    }
}
/*
 * Clamp `a` to the range [min, max] and return the result.
 * Note the argument order: the bounds come first, the value last.
 * The lower bound wins when both comparisons are true (i.e. min > max).
 */
float constrain(float min, float max, float a)
{
    float clamped = a;

    if (a > max) clamped = max;
    if (a < min) clamped = min;   /* checked last so it takes precedence */
    return clamped;
}
/*
 * Forward pass of the detection layer; when state.train is set it also fills
 * l.delta and *(l.cost).
 *
 * Layouts implied by the index arithmetic below (locations = l.side*l.side):
 *   l.output / l.delta, per batch item (stride l.inputs):
 *     [locations * l.classes]        class scores, l.classes per location
 *     [locations * l.n]              one confidence value per predicted box
 *     [locations * l.n * l.coords]   box values, l.coords per predicted box
 *   state.truth, per (batch item, location): 1 + l.coords + l.classes floats:
 *     object flag, l.classes class values, then the box read by float_to_box().
 *
 * NOTE(review): best_index starts at -1 and is only set when some box has
 * iou > 0 or rmse < 20; if neither ever holds (and l.forced / l.random do not
 * apply) the indexes after the search are computed with -1 - confirm whether
 * that can happen in practice.
 */
void forward_detection_layer(const detection_layer l, network_state state)
{
int locations = l.side*l.side;
int i,j;
// The layer output starts as a copy of the layer input.
memcpy(l.output, state.input, l.outputs*l.batch*sizeof(float));
int b;
// Optionally run softmax_array in place over each location's l.classes class scores.
if (l.softmax){
for(b = 0; b < l.batch; ++b){
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
int offset = i*l.classes;
softmax_array(l.output + index + offset, l.classes, 1,
l.output + index + offset);
}
}
}
if(state.train){
// Running sums used only for the statistics printed at the end.
float avg_iou = 0;
float avg_cat = 0;
float avg_allcat = 0;
float avg_obj = 0;
float avg_anyobj = 0;
int count = 0;
*(l.cost) = 0;
int size = l.inputs * l.batch;
memset(l.delta, 0, size * sizeof(float));
for (b = 0; b < l.batch; ++b){
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
int truth_index = (b*locations + i)*(1+l.coords+l.classes);
// First truth value of the cell: nonzero when the cell contains an object.
int is_obj = state.truth[truth_index];
// Start by treating every predicted box of this cell as "no object":
// its confidence is pushed towards 0, weighted by l.noobject_scale.
for (j = 0; j < l.n; ++j) {
int p_index = index + locations*l.classes + i*l.n + j;
l.delta[p_index] = l.noobject_scale*(0 - l.output[p_index]);
*(l.cost) += l.noobject_scale*pow(l.output[p_index], 2);
avg_anyobj += l.output[p_index];
}
int best_index = -1;
float best_iou = 0;
float best_rmse = 20;
// Cells without an object contribute nothing beyond the no-object term above.
if (!is_obj){
continue;
}
// Class term: delta is (truth - output) per class, weighted by l.class_scale.
int class_index = index + i*l.classes;
for(j = 0; j < l.classes; ++j) {
l.delta[class_index+j] = l.class_scale * (state.truth[truth_index+1+j] - l.output[class_index+j]);
*(l.cost) += l.class_scale * pow(state.truth[truth_index+1+j] - l.output[class_index+j], 2);
if(state.truth[truth_index + 1 + j]) avg_cat += l.output[class_index+j];
avg_allcat += l.output[class_index+j];
}
// Truth box of this cell; x and y are divided by l.side before the comparison
// (presumably to bring cell-relative offsets to the same scale as w/h - TODO confirm).
box truth = float_to_box(state.truth + truth_index + 1 + l.classes);
truth.x /= l.side;
truth.y /= l.side;
// Pick which of the l.n predicted boxes is matched to the truth box.
for(j = 0; j < l.n; ++j){
int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
box out = float_to_box(l.output + box_index);
out.x /= l.side;
out.y /= l.side;
// With l.sqrt the stored w/h are squared before being compared.
if (l.sqrt){
out.w = out.w*out.w;
out.h = out.h*out.h;
}
float iou = box_iou(out, truth);
//iou = 0;
float rmse = box_rmse(out, truth);
// As soon as any box has a positive IOU the highest IOU wins;
// until then the box with the smallest rmse (below 20) is kept.
if(best_iou > 0 || iou > 0){
if(iou > best_iou){
best_iou = iou;
best_index = j;
}
}else{
if(rmse < best_rmse){
best_rmse = rmse;
best_index = j;
}
}
}
// l.forced overrides the search: index 1 when the truth area is below .1, otherwise index 0.
if(l.forced){
if(truth.w*truth.h < .1){
best_index = 1;
}else{
best_index = 0;
}
}
// l.random overrides it with a random box while *(state.net.seen) is below 64000.
if(l.random && *(state.net.seen) < 64000){
best_index = rand()%l.n;
}
// Recompute the chosen box and its IOU with the truth box.
int box_index = index + locations*(l.classes + l.n) + (i*l.n + best_index) * l.coords;
int tbox_index = truth_index + 1 + l.classes;
box out = float_to_box(l.output + box_index);
out.x /= l.side;
out.y /= l.side;
if (l.sqrt) {
out.w = out.w*out.w;
out.h = out.h*out.h;
}
float iou = box_iou(out, truth);
//printf("%d,", best_index);
// For the chosen box, replace the no-object confidence term added earlier
// with the object term (target 1, weighted by l.object_scale).
int p_index = index + locations*l.classes + i*l.n + best_index;
*(l.cost) -= l.noobject_scale * pow(l.output[p_index], 2);
*(l.cost) += l.object_scale * pow(1-l.output[p_index], 2);
avg_obj += l.output[p_index];
l.delta[p_index] = l.object_scale * (1.-l.output[p_index]);
// With l.rescore the confidence target is the IOU instead of 1.
if(l.rescore){
l.delta[p_index] = l.object_scale * (iou - l.output[p_index]);
}
// Coordinate term for the chosen box only: (truth - output), weighted by l.coord_scale.
l.delta[box_index+0] = l.coord_scale*(state.truth[tbox_index + 0] - l.output[box_index + 0]);
l.delta[box_index+1] = l.coord_scale*(state.truth[tbox_index + 1] - l.output[box_index + 1]);
l.delta[box_index+2] = l.coord_scale*(state.truth[tbox_index + 2] - l.output[box_index + 2]);
l.delta[box_index+3] = l.coord_scale*(state.truth[tbox_index + 3] - l.output[box_index + 3]);
// With l.sqrt the w/h targets are the square roots of the truth w/h.
if(l.sqrt){
l.delta[box_index+2] = l.coord_scale*(sqrt(state.truth[tbox_index + 2]) - l.output[box_index + 2]);
l.delta[box_index+3] = l.coord_scale*(sqrt(state.truth[tbox_index + 3]) - l.output[box_index + 3]);
}
*(l.cost) += pow(1-iou, 2);
avg_iou += iou;
++count;
}
}
// Disabled code (the condition is the constant 0): would zero every confidence
// delta whose square is below the 100th largest one.
if(0){
float *costs = calloc(l.batch*locations*l.n, sizeof(float));
for (b = 0; b < l.batch; ++b) {
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
for (j = 0; j < l.n; ++j) {
int p_index = index + locations*l.classes + i*l.n + j;
costs[b*locations*l.n + i*l.n + j] = l.delta[p_index]*l.delta[p_index];
}
}
}
int indexes[100];
top_k(costs, l.batch*locations*l.n, 100, indexes);
float cutoff = costs[indexes[99]];
for (b = 0; b < l.batch; ++b) {
int index = b*l.inputs;
for (i = 0; i < locations; ++i) {
for (j = 0; j < l.n; ++j) {
int p_index = index + locations*l.classes + i*l.n + j;
if (l.delta[p_index]*l.delta[p_index] < cutoff) l.delta[p_index] = 0;
}
}
}
free(costs);
}
// The cost accumulated above is overwritten here with a value computed from l.delta alone.
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
// NOTE(review): the averages divide by count, which is 0 when no cell in the batch held an object.
printf("Detection Avg IOU: %f, Pos Cat: %f, All Cat: %f, Pos Obj: %f, Any Obj: %f, count: %d\n", avg_iou/count, avg_cat/count, avg_allcat/(count*l.classes), avg_obj/count, avg_anyobj/(l.batch*locations*l.n), count);
}
}
这里参数意义如下:
locations:7*7
b :batch size的索引
i :locations的索引
1 :置信度
l.coords :值为4,分别表示x,y,w,h
l.classes : 20
// Search the l.n boxes predicted for cell i for the one matched to the truth box.
for(j = 0; j < l.n; ++j)
{
int box_index = index + locations*(l.classes + l.n) + (i*l.n + j) * l.coords;
box out = float_to_box(l.output + box_index);
out.x /= l.side;
out.y /= l.side;
// With l.sqrt the stored w/h are squared before being compared.
if (l.sqrt){
out.w = out.w*out.w;
out.h = out.h*out.h;
}
// IOU between the predicted box and the truth box.
float iou = box_iou(out, truth);
//iou = 0;
// Root-mean-square error between the predicted box and the truth box.
float rmse = box_rmse(out, truth);
// Keep the box with the largest IOU; while no box has a positive IOU yet,
// keep the box with the smallest rmse instead.
if(best_iou > 0 || iou > 0){
if(iou > best_iou){
best_iou = iou;
best_index = j;
}
}else{
if(rmse < best_rmse){
best_rmse = rmse;
best_index = j;
}
}
}
out(每个网格一共l.n个out,论文中l.n=2)就是网络回归出来的值,然后把out的值和truth中的值对应比较,计算出iou,然后从l.n
个iou中挑出iou最高的一个(如果所有iou都为0,则挑rmse最小的一个)作为负责该目标的预测框。说白了就是:只有该框会产生坐标损失和“有目标”的置信度损失;其余的框并不是完全没有影响,它们仍然按noobject_scale产生“无目标”的置信度损失
-------------------------------------------------------
2008_000008.txt
12 0.524 0.573529411765 0.836 0.753393665158
14 0.447 0.238687782805 0.262 0.278280542986
2009_003377.txt
15 0.049 0.202488687783 0.094 0.246606334842
17 0.501 0.641402714932 0.998 0.717194570136
2009_003377.txt
15 0.049 0.202488687783 0.094 0.246606334842
17 0.501 0.641402714932 0.998 0.717194570136
/*
 * Return `a` limited to [min, max]: min when a is below it, max when a is
 * above it, otherwise a itself. The lower-bound test is evaluated first.
 */
float constrain(float min, float max, float a)
{
    return (a < min) ? min
         : (a > max) ? max
         : a;
}