YOLOv3源码阅读：get_kmeans.py

2019-05-22 2019-08-31

目标检测

10 minutes read (About 1498 words) 0 visits

一、YOLO简介

YOLO（You Only Look Once）是一个高效的目标检测算法，属于One-Stage大家族。针对Two-Stage目标检测算法普遍存在的运算速度慢的缺点，YOLO创造性地提出了One-Stage方法，也就是将物体分类和物体定位在一个步骤中完成。YOLO直接在输出层回归bounding box的位置和bounding box所属的类别，从而实现One-Stage。

经过两次迭代，YOLO目前的最新版本为YOLOv3，在前两版的基础上，YOLOv3进行了一些比较细节的改动，效果有所提升。

本文正是希望可以将源码加以注释，方便自己学习，同时也愿意分享出来和大家一起学习。由于本人还是一名学生，如果有错还请大家不吝指出。

本文参考的源码地址为：https://github.com/wizyoung/YOLOv3_TensorFlow

二、代码和注释

文件目录：YOUR_PATH\YOLOv3_TensorFlow-master\get_kmeans.py

这个脚本的主要作用是使用kmeans聚类产生若干个anchors中心，在训练的时候将这些anchors作为先验条件。这里的聚类主要是对目标检测框的尺寸（宽度和高度）进行聚类。

# coding: utf-8
# This script is modified from https://github.com/lars76/kmeans-anchor-boxes

from __future__ import division, print_function

import numpy as np

# 计算IOU，box一个长度为2的数组，表示box的尺寸，clusters表示的是若干集群的中心，同样也是尺寸。
def iou(box, clusters):
    """
    Compute the Intersection over Union (IoU) of one box against k clusters.

    Only widths and heights are compared: the overlap of two boxes is taken
    as min(width) * min(height), i.e. as if both shared the same corner.
    param:
        box: tuple or array indexed as (width, height)
        clusters: numpy array of shape (k, 2), one (width, height) per cluster
    return:
        numpy array of shape (k,) holding the IoU of box with each cluster
    raise:
        ValueError: if any overlap width or overlap height equals zero
    """
    cluster_w = clusters[:, 0]
    cluster_h = clusters[:, 1]
    box_w, box_h = box[0], box[1]

    # Element-wise overlap extents between the box and every cluster.
    overlap_w = np.minimum(cluster_w, box_w)
    overlap_h = np.minimum(cluster_h, box_h)
    if np.any(overlap_w == 0) or np.any(overlap_h == 0):
        raise ValueError("Box has no area")

    intersection = overlap_w * overlap_h
    union = box_w * box_h + cluster_w * cluster_h - intersection

    # The small epsilon keeps the division defined when the union is zero.
    return intersection / (union + 1e-10)


def avg_iou(boxes, clusters):
    """
    Average, over all boxes, of each box's best IoU against the k clusters.
    param:
        boxes: numpy array of shape (r, 2), where r is the number of rows
        clusters: numpy array of shape (k, 2) where k is the number of clusters
    return:
        mean of the per-box maximum IoU, as a single float
    """
    best_per_box = []
    for idx in range(boxes.shape[0]):
        # IoU of this box with every cluster; keep only the best match.
        overlaps = iou(boxes[idx], clusters)
        best_per_box.append(np.max(overlaps))
    return np.mean(best_per_box)


# 这个函数并未在任何地方被使用
def translate_boxes(boxes):
    """
    Convert corner-format boxes into origin-anchored (width, height) pairs.

    Column 2 is replaced by |col2 - col0| and column 3 by |col3 - col1|,
    then columns 0 and 1 are dropped. The input array is not modified.
    param:
        boxes: numpy array of shape (r, 4)
    return:
        numpy array of shape (r, 2)
    """
    shifted = boxes.copy()
    # Whole-column arithmetic instead of visiting the rows one by one.
    shifted[:, 2] = np.abs(shifted[:, 2] - shifted[:, 0])
    shifted[:, 3] = np.abs(shifted[:, 3] - shifted[:, 1])
    return np.delete(shifted, [0, 1], axis=1)


def kmeans(boxes, k, dist=np.median, seed=None):
    """
    Calculates k-means clustering with the Intersection over Union (IoU) metric.

    The distance between a box and a cluster center is 1 - IoU, so a larger
    IoU means a closer center. Iteration stops once no box changes cluster
    between two consecutive passes.
    param:
        boxes: numpy array of shape (r, 2), where r is the number of rows
        k: number of clusters; the initial centers are drawn from boxes
            without replacement, so k must not exceed r
        dist: function used to reduce the members of a cluster to its new
            center; called as dist(members, axis=0)
        seed: optional value passed to np.random.seed; the default None
            reseeds the global generator exactly as before, pass an int for
            reproducible results
    return:
        numpy array of shape (k, 2)
    """
    # Number of annotated boxes in the data set.
    rows = boxes.shape[0]

    distances = np.empty((rows, k))
    # Cluster index of every box from the previous pass. -1 is never a valid
    # index, so the very first pass cannot be mistaken for convergence. An
    # all-zero start would be whenever every box lands in cluster 0 (always
    # the case for k == 1), returning the random initial centers untouched.
    last_clusters = np.full((rows,), -1)

    np.random.seed(seed)

    # the Forgy method will fail if the whole array contains the same rows
    # Pick k random boxes as the initial cluster centers.
    clusters = boxes[np.random.choice(rows, k, replace=False)]

    while True:
        # Distance of every box to every center.
        for row in range(rows):
            distances[row] = 1 - iou(boxes[row], clusters)

        # Index of the closest center for every box.
        nearest_clusters = np.argmin(distances, axis=1)

        # No assignment changed during this pass: converged.
        if (last_clusters == nearest_clusters).all():
            break

        # Move each center to the dist() (median by default) of its members.
        for cluster in range(k):
            members = boxes[nearest_clusters == cluster]
            # A cluster may lose all of its members (e.g. duplicate centers).
            # Reducing an empty selection yields NaN, which would then poison
            # every later IoU, so keep the previous center in that case.
            if members.shape[0] > 0:
                clusters[cluster] = dist(members, axis=0)

        last_clusters = nearest_clusters

    return clusters


def parse_anno(annotation_path):
    """
    Read an annotation file and collect the size of every labelled box.

    Each line is whitespace-separated: the first two fields (image index and
    image path) are skipped, and the rest is read in groups of five fields,
    of which the last four are used as x_min, y_min, x_max, y_max.
    param:
        annotation_path: path of the annotation text file
    return:
        numpy float array of shape (n, 2) holding (width, height) per box;
        shape (0, 2) when the file contains no boxes
    raise:
        AssertionError: if a box has non-positive width or height
        ValueError: if a coordinate field cannot be parsed as a float
    """
    result = []

    # The context manager closes the file even if parsing fails midway.
    with open(annotation_path, 'r') as anno:
        for line in anno:
            # split() without an argument tolerates repeated spaces and tabs;
            # the first two fields are not box data.
            fields = line.split()[2:]

            # Five fields per box; a trailing incomplete group is ignored.
            box_cnt = len(fields) // 5

            for i in range(box_cnt):
                start = i * 5 + 1
                x_min, y_min, x_max, y_max = [
                    float(value) for value in fields[start:start + 4]
                ]
                width = x_max - x_min
                height = y_max - y_min
                # Kept as asserts so the exception type seen by callers is
                # unchanged; note they are skipped when Python runs with -O.
                assert width > 0, 'box width must be positive: {}'.format(line.strip())
                assert height > 0, 'box height must be positive: {}'.format(line.strip())
                result.append([width, height])

    # reshape keeps the documented (n, 2) layout even when no box was found.
    return np.asarray(result, dtype=float).reshape(-1, 2)


def get_kmeans(anno, cluster_num=9):
    """
    Cluster the box sizes into anchors and report how well they fit.
    param:
        anno: array of box sizes, passed unchanged to kmeans and avg_iou
        cluster_num: number of anchors to produce
    return:
        (anchors, ave_iou): anchors is a list of [width, height] integer
        pairs sorted by ascending area; ave_iou is the average IoU computed
        from the cluster centers before they are cast to int
    """
    centers = kmeans(anno, cluster_num)

    # Measured on the raw centers, i.e. before the integer cast below.
    mean_iou = avg_iou(anno, centers)

    int_anchors = centers.astype('int').tolist()
    # Smallest anchor first, ordered by width * height.
    int_anchors.sort(key=lambda wh: wh[0] * wh[1])

    return int_anchors, mean_iou


if __name__ == '__main__':
    # Script entry point: read the annotations, cluster them into 9 anchors
    # and print the anchors together with their average IoU.
    annotation_path = "./data/my_data/train.txt"
    anno_result = parse_anno(annotation_path)
    anchors, ave_iou = get_kmeans(anno_result, 9)

    # Render the anchors as "w,h, w,h, ..." with no trailing separator.
    anchor_string = ', '.join(
        '{},{}'.format(anchor[0], anchor[1]) for anchor in anchors
    )

    print('anchors are:')
    print(anchor_string)
    print('the average iou is:')
    print(ave_iou)