文本检测中的NMS

今天被问到了OCR相关的NMS,个人一直偏向于通用目标检测的NMS,正好补补课,扩展一下OCR方向的知识.

通用目标检测或人脸检测得到的检测框都是矩形的,而文本检测具有特殊性:文本可能倾斜、弯曲,因此通用的NMS算法不是特别适合。目前针对文本检测的NMS主要有 locality-aware NMS、inclined NMS、mask NMS、polygonal NMS。

locality aware NMS

LNMS是旷视论文EAST中提出来的。由于文本检测可能会检测出大量的候选box,而标准NMS需要对候选框两两比较,时间复杂度为 $O(n^2)$,开销很高。作者基于这样一个假设:靠得近的像素预测出的几何体倾向于具有高的相关性。因此先逐行将相邻的几何体进行加权合并,再对合并后的结果做标准NMS。LNMS一般用于轴对齐的矩形框(即水平bbox),特别是离得很近的倾斜文本.

  • 对输出的每个box根据IOU阈值进行合并
  • 对合并后的box进行标准NMS

    import numpy as np
    from shapely.geometry import Polygon
    
    
    def intersection(g, p):
        """Return the IoU (intersection over union) of two quadrilaterals.

        Despite its name, this returns the IoU ratio, not the raw
        intersection area.

        :param g: 1-D numpy array; the first 8 values are read as 4 (x, y) vertices
        :param p: 1-D numpy array with the same layout as ``g``
        :return: the IoU, or 0 when either polygon is reported invalid by
                 shapely or when the union area is zero
        """
        g = Polygon(g[:8].reshape((4, 2)))
        p = Polygon(p[:8].reshape((4, 2)))
        # Invalid (e.g. self-intersecting) quadrilaterals are treated as non-overlapping.
        if not g.is_valid or not p.is_valid:
            return 0
        # g and p are already Polygon objects, so no re-wrapping is needed here.
        inter = g.intersection(p).area
        union = g.area + p.area - inter
        # Guard against division by zero for degenerate (zero-area) inputs.
        if union == 0:
            return 0
        return inter / union
    
    
    def weighted_merge(g, p):
        """Merge geometry ``p`` into geometry ``g`` using score-weighted averaging.

        ``g`` is modified in place and returned; ``p`` is left untouched.

        :param g: 1-D numpy array of 9 values: 8 coordinates followed by a score
        :param p: 1-D numpy array with the same layout as ``g``
        :return: ``g`` itself, now holding the merged coordinates and score
        """
        # Merged coordinates: score-weighted sum of both geometries divided by the total weight.
        g[:8] = (g[8] * g[:8] + p[8] * p[:8]) / (g[8] + p[8])
        # Merged score: the sum of both scores.
        g[8] = g[8] + p[8]
        return g
    
    
    def standard_nms(S, thres):
        """Run greedy NMS over quadrilaterals, highest score first.

        :param S: N*9 numpy array; each row is 8 coordinates followed by a score
        :param thres: IoU threshold; a remaining box is dropped when its IoU
                      with the currently kept box is greater than this value
        :return: the rows of ``S`` that survive, in descending score order
        """
        # Indices of S sorted by score (column 8), best first.
        order = np.argsort(S[:, 8])[::-1]
        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            # IoU of the kept box against every remaining candidate.
            ovr = np.array([intersection(S[i], S[t]) for t in order[1:]])

            # ovr is indexed relative to order[1:], hence the +1 shift back into order.
            inds = np.where(ovr <= thres)[0]
            order = order[inds + 1]

        return S[keep]
    
    
    def nms_locality(polys, thres=0.3):
        """Locality-aware NMS of EAST.

        Walks ``polys`` in the given order, merging each geometry into the
        running merged geometry while their IoU exceeds ``thres``, then runs
        standard NMS (with the same threshold) on the merged geometries.

        :param polys: a N*9 numpy array. first 8 coordinates, then prob
        :param thres: IoU threshold used both for merging and for the final NMS
        :return: boxes after nms (an empty array when ``polys`` is empty)
        """
        S = []
        p = None  # the running merged geometry
        for g in polys:
            if p is not None and intersection(g, p) > thres:
                # weighted_merge writes into its first argument; merge into a
                # copy so the caller's `polys` rows are not modified.
                p = weighted_merge(g.copy(), p)
            else:
                # No overlap with the running geometry: flush it and start a new one.
                if p is not None:
                    S.append(p)
                p = g
        if p is not None:
            S.append(p)

        if not S:
            return np.array([])
        return standard_nms(np.array(S), thres)
    

inclined NMS(倾斜NMS)

对于带旋转角度的框,进行NMS时考虑角度

#coding=utf-8
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import cv2
import tensorflow as tf

def nms_rotate(decode_boxes, scores, iou_threshold, max_output_size,
               use_angle_condition=False, angle_threshold=0, use_gpu=False, gpu_id=0):
    """
    Rotated-box NMS entry point; dispatches to the CPU or GPU implementation.

    :param decode_boxes: rotated boxes, 5 values per row.
        NOTE(review): the original docstring gave the format as
        [x_c, y_c, w, h, theta], but both backends swap columns 0/1 and 2/3
        before use, which suggests [y_c, x_c, h, w, theta] - confirm.
    :param scores: scores of boxes
    :param iou_threshold: iou threshold (0.7 or 0.5)
    :param max_output_size: max number of output
    :param use_angle_condition: forwarded to nms_rotate_gpu (GPU path only)
    :param angle_threshold: forwarded to nms_rotate_gpu as angle_gap_threshold (GPU path only)
    :param use_gpu: choose the GPU implementation when true, otherwise the CPU one
    :param gpu_id: forwarded to nms_rotate_gpu as device_id (GPU path only)
    :return: the remaining index of boxes
    """
    if not use_gpu:
        # CPU path: wrap the numpy implementation as a graph op; it enforces
        # max_output_size itself.
        return tf.py_func(nms_rotate_cpu,
                          inp=[decode_boxes, scores, iou_threshold, max_output_size],
                          Tout=tf.int64)

    # GPU path.
    gpu_keep = nms_rotate_gpu(boxes_list=decode_boxes,
                              scores=scores,
                              iou_threshold=iou_threshold,
                              angle_gap_threshold=angle_threshold,
                              use_angle_condition=use_angle_condition,
                              device_id=gpu_id)

    # Truncate to the first max_output_size indices only when there are more than that.
    too_many = tf.greater(tf.shape(gpu_keep)[0], max_output_size)
    return tf.cond(
        too_many,
        true_fn=lambda: tf.slice(gpu_keep, [0], [max_output_size]),
        false_fn=lambda: gpu_keep)

def nms_rotate_cpu(boxes, scores, iou_threshold, max_output_size):
    """
    Greedy NMS over rotated boxes using OpenCV for the polygon intersection.

    :param boxes: N*5 numpy array. Columns 1, 0 are used as the OpenCV centre,
        columns 3, 2 as the OpenCV size and column 4 as the angle.
    :param scores: numpy array of N scores
    :param iou_threshold: a box is suppressed when its IoU with a kept box is
        greater than or equal to this value
    :param max_output_size: stop once this many boxes have been kept
    :return: int64 numpy array with the indices of the kept boxes, best score first
    """
    keep = []  # indices of the boxes that survive
    order = scores.argsort()[::-1]  # box indices by descending score
    num = boxes.shape[0]  # number of boxes

    # Plain bool dtype: the `np.int` alias was removed in NumPy 1.24.
    suppressed = np.zeros(num, dtype=bool)
    for _i in range(num):
        if len(keep) >= max_output_size:  # enough boxes kept, stop early
            break

        i = order[_i]
        if suppressed[i]:  # already suppressed by a higher-scoring box
            continue
        keep.append(i)
        # Build the OpenCV rotated rect ((cx, cy), (w, h), angle) for the kept box.
        r1 = ((boxes[i, 1], boxes[i, 0]), (boxes[i, 3], boxes[i, 2]), boxes[i, 4])
        area_r1 = boxes[i, 2] * boxes[i, 3]
        for _j in range(_i + 1, num):  # compare against every lower-scoring box
            j = order[_j]
            # Skip candidates that are already suppressed (the original tested
            # index i here, which is never suppressed at this point).
            if suppressed[j]:
                continue
            r2 = ((boxes[j, 1], boxes[j, 0]), (boxes[j, 3], boxes[j, 2]), boxes[j, 4])
            area_r2 = boxes[j, 2] * boxes[j, 3]
            inter = 0.0

            # Vertices of the intersection region of the two rotated rects (None if disjoint).
            int_pts = cv2.rotatedRectangleIntersection(r1, r2)[1]
            if int_pts is not None:
                # Order the vertices as a convex polygon so its area can be measured.
                order_pts = cv2.convexHull(int_pts, returnPoints=True)
                int_area = cv2.contourArea(order_pts)
                # IoU; the small epsilon avoids division by zero.
                inter = int_area * 1.0 / (area_r1 + area_r2 - int_area + 0.0000001)

            if inter >= iou_threshold:  # overlaps the kept box too much
                suppressed[j] = True

    return np.array(keep, np.int64)

# gpu的实现方式
def nms_rotate_gpu(boxes_list, scores, iou_threshold, use_angle_condition=False, angle_gap_threshold=0, device_id=0):
    """
    GPU rotated-box NMS: builds the detection tensor and calls rotate_gpu_nms.

    :param boxes_list: tensor with 5 columns, unstacked as (y_c, x_c, h, w, theta)
    :param scores: tensor of box scores, appended as the last column
    :param iou_threshold: passed through to rotate_gpu_nms
    :param use_angle_condition: when true the raw py_func output is returned;
        otherwise it is flattened to 1-D first
    :param angle_gap_threshold: accepted but not used by this function
    :param device_id: passed through to rotate_gpu_nms
    :return: int64 tensor produced by rotate_gpu_nms
    """
    # Swap columns from (y_c, x_c, h, w, theta) to (x_c, y_c, w, h, theta).
    y_c, x_c, h, w, theta = tf.unstack(boxes_list, axis=1)
    reordered = tf.transpose(tf.stack([x_c, y_c, w, h, theta]))

    # Append the score as a sixth column.
    score_column = tf.expand_dims(scores, axis=1)
    det_tensor = tf.concat([reordered, score_column], axis=1)

    keep = tf.py_func(rotate_gpu_nms,
                      inp=[det_tensor, iou_threshold, device_id],
                      Tout=tf.int64)
    if use_angle_condition:
        return keep
    return tf.reshape(keep, [-1])

Polygon NMS(多边形NMS)

来源于论文 Detecting Curve Text in the Wild: New Dataset and New Solution, 做曲线文本检测. 这篇论文中是检测14个点. 就是将标准NMS4个点换成14点的多边形.

def py_cpu_pnms(dets, thresh):
    """Polygon NMS: greedy NMS where overlap is measured between 14-point polygons.

    :param dets: numpy array with one detection per row. Columns 0-3 are the
        bounding box, column 4 is the score and columns 5-32 hold 14 (dx, dy)
        pairs that are added to the integer-truncated box columns 0 and 1 to
        give the polygon vertices.
    :param thresh: a remaining detection is dropped when its polygon IoU with
        the currently kept detection is greater than this value
    :return: list of kept row indices, in descending score order
    """
    bbox = dets[:, :4]
    scores = dets[:, 4]
    info_bbox = dets[:, 5:33]  # syn
    num = dets.shape[0]

    # Build every polygon once instead of re-creating it inside the pairwise loop.
    polys = []
    for i in range(num):
        x0 = int(bbox[i, 0])
        y0 = int(bbox[i, 1])
        polys.append(Polygon([[x0 + info_bbox[i, j], y0 + info_bbox[i, j + 1]]
                              for j in range(0, 28, 2)]))

    # Pre-compute polygon areas and the symmetric pairwise intersection areas.
    areas = np.zeros(scores.shape)
    inter_areas = np.zeros((scores.shape[0], scores.shape[0]))
    for il in range(num):
        areas[il] = polys[il].area
        for jl in range(il, num):
            shared = polys[il].intersection(polys[jl]).area
            inter_areas[il][jl] = shared
            inter_areas[jl][il] = shared

    order = scores.argsort()[::-1]  # detection indices by descending score
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)

        rest = order[1:]
        ovr = inter_areas[i][rest] / (areas[i] + areas[rest] - inter_areas[i][rest])

        # ovr is indexed relative to order[1:], hence the +1 shift back into order.
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return keep

REF

LNMS代码

INMS代码

PNMS代码