warpAffine 是图像处理中比较常见的一种变换,可以将图像校正或对齐。
对于线性插值方式,OpenCV 首先将坐标映射保存成两张图,然后调用 remap 函数。第二步是比较耗时的部分,并且 warpPerspective 亦采用此处理。
remap 通过构建查找表来存储系数乘积,这样减少了乘法运算次数。
由于篇幅过长,将文章分成 warpAffine 和 remap 两部分。
cv::warpAffine
检查输入通道数量以及插值类型。 如果目的矩阵为空则创建内存,如果是原地操作则复制一份源数据。
CV_INSTRUMENT_REGION();
int interpolation = flags & INTER_MAX;
CV_Assert( _src.channels() <= 4 || (interpolation != INTER_LANCZOS4 &&
interpolation != INTER_CUBIC) );
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() &&
_src.cols() <= SHRT_MAX && _src.rows() <= SHRT_MAX,
ocl_warpTransform_cols4(_src, _dst, _M0, dsize, flags, borderType,
borderValue, OCL_OP_AFFINE))
CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
borderValue, OCL_OP_AFFINE))
Mat src = _src.getMat(), M0 = _M0.getMat();
_dst.create( dsize.empty() ? src.size() : dsize, src.type() );
Mat dst = _dst.getMat();
CV_Assert( src.cols > 0 && src.rows > 0 );
if( dst.data == src.data )
src = src.clone();
如果未设置WARP_INVERSE_MAP,则对参数矩阵M求逆。
double M[6] = {0};
Mat matM(2, 3, CV_64F, M);
if( interpolation == INTER_AREA )
interpolation = INTER_LINEAR;
CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
M0.convertTo(matM, matM.type());
if( !(flags & WARP_INVERSE_MAP) )
{
double D = M[0]*M[4] - M[1]*M[3];
D = D != 0 ? 1./D : 0;
double A11 = M[4]*D, A22=M[0]*D;
M[0] = A11; M[1] *= -D;
M[3] *= -D; M[4] = A22;
double b1 = -M[0]*M[2] - M[1]*M[5];
double b2 = -M[3]*M[2] - M[4]*M[5];
M[2] = b1; M[5] = b2;
}
#if defined (HAVE_IPP) && IPP_VERSION_X100 >= 810 && !IPP_DISABLE_WARPAFFINE
CV_IPP_CHECK()
{
int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
( cn == 1 || cn == 3 || cn == 4 ) &&
( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
{
ippiWarpAffineBackFunc ippFunc = 0;
if ((flags & WARP_INVERSE_MAP) != 0)
{
ippFunc =
type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
0;
}
else
{
ippFunc =
type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
0;
}
int mode =
interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
interpolation == INTER_NEAREST ? IPPI_INTER_NN :
interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
0;
CV_Assert(mode && ippFunc);
double coeffs[2][3];
for( int i = 0; i < 2; i++ )
for( int j = 0; j < 3; j++ )
coeffs[i][j] = matM.at<double>(i, j);
bool ok;
Range range(0, dst.rows);
IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
if( ok )
{
CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
return;
}
setIppErrorStatus();
}
}
#endif
调用命名空间中的 hal::warpAffine 函数,各参数独立传入。
hal::warpAffine(src.type(), src.data, src.step, src.cols, src.rows, dst.data, dst.step, dst.cols, dst.rows,
M, interpolation, borderType, borderValue.val);
CALL_HAL(warpAffine, cv_hal_warpAffine, src_type, src_data, src_step, src_width, src_height, dst_data, dst_step, dst_width, dst_height, M, interpolation, borderType, borderValue);
Mat src(Size(src_width, src_height), src_type, const_cast<uchar*>(src_data), src_step);
Mat dst(Size(dst_width, dst_height), src_type, dst_data, dst_step);
int x;
AutoBuffer<int> _abdelta(dst.cols*2);
int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
for( x = 0; x < dst.cols; x++ )
{
adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
}
创建调用者 WarpAffineInvoker,parallel_for_ 函数并行调用。
Range range(0, dst.rows);
WarpAffineInvoker invoker(src, dst, interpolation, borderType,
Scalar(borderValue[0], borderValue[1], borderValue[2], borderValue[3]),
adelta, bdelta, M);
parallel_for_(range, invoker, dst.total()/(double)(1<<16));
WarpAffineInvoker
public:
WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
const Scalar &_borderValue, int *_adelta, int *_bdelta, const double *_M) :
ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
M(_M)
{
}
operator()
处理操作符函数。
BLOCK_SZ指定分块大小。XY存储源图上对应的坐标,A为辅助系数。INTER_BITS 为5。INTER_TAB_SIZE 为 ,这样双线性插值表大小为32x32x4。AB_SCALE为 。这样计算结果的 10.6 的格式存储。 处理区域为BLOCK_SZ/2行、BLOCK_SZ*2列的矩形。为什么?
virtual void operator() (const Range& range) const CV_OVERRIDE
{
const int BLOCK_SZ = 64;
AutoBuffer<short, 0> __XY(BLOCK_SZ * BLOCK_SZ * 2), __A(BLOCK_SZ * BLOCK_SZ);
short *XY = __XY.data(), *A = __A.data();
const int AB_BITS = MAX(10, (int)INTER_BITS);
const int AB_SCALE = 1 << AB_BITS;
int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
#if CV_TRY_AVX2
bool useAVX2 = CV_CPU_HAS_SUPPORT_AVX2;
#endif
#if CV_TRY_SSE4_1
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
#endif
int bh0 = std::min(BLOCK_SZ/2, dst.rows);
int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
xy指向结果矩阵的当前行行首。X0为 Y0为 round_delta为16。_XY将缓冲区__XY封装成了矩阵。dpart为结果的 RoI 区域。
for( y = range.start; y < range.end; y += bh0 )
{
for( x = 0; x < dst.cols; x += bw0 )
{
int bw = std::min( bw0, dst.cols - x);
int bh = std::min( bh0, range.end - y);
Mat _XY(bh, bw, CV_16SC2, XY), matA;
Mat dpart(dst, Rect(x, y, bw, bh));
for( y1 = 0; y1 < bh; y1++ )
{
short* xy = XY + y1*bw*2;
int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
如果是最近邻方法。WarpAffineInvoker_Blockline_SSE41
if( interpolation == INTER_NEAREST )
{
x1 = 0;
#if CV_TRY_SSE4_1
if( useSSE4_1 )
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta + x, bdelta + x, xy, X0, Y0, bw);
else
#endif
{
#if CV_SIMD128
{
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
int span = v_uint16x8::nlanes;
for( ; x1 <= bw - span; x1 += span )
{
v_int16x8 v_dst[2];
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(shift+v_load(ptr + offset)),\
v_shr<AB_BITS>(shift+v_load(ptr + offset + 4)))
v_dst[0] = CV_CONVERT_MAP(adelta, x+x1, v_X0);
v_dst[1] = CV_CONVERT_MAP(bdelta, x+x1, v_Y0);
#undef CV_CONVERT_MAP
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
}
}
#endif
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> AB_BITS;
int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
xy[x1*2] = saturate_cast<short>(X);
xy[x1*2+1] = saturate_cast<short>(Y);
}
}
}
对于双线性插值。alpha指向当前行。 avx2可以调用 warpAffineBlocklinev_mask为向量掩码。 每次处理span*2个数。v_setall_s32 通过 OPENCV_HAL_IMPL_NEON_INIT 宏来定义。v_load 从存储器中加载寄存器内容。 计算 和 时,是两个v_int32x4相加(vaddq_s32),得到v_int32x4的结果。然后右移 (10-5)位。v_shr 右移。v_pack 将两个数据宽度减半后拼到一起。 两次右移后v_store_interleave 将2个寄存器中的数据交错存储到内存中。v_shl 左移。v_store 将寄存器内容存储到内存中。v_Y0和v_X0合并。
else
{
short* alpha = A + y1*bw;
x1 = 0;
#if CV_TRY_AVX2
if ( useAVX2 )
x1 = opt_AVX2::warpAffineBlockline(adelta + x, bdelta + x, xy, alpha, X0, Y0, bw);
#endif
#if CV_SIMD128
{
v_int32x4 v__X0 = v_setall_s32(X0), v__Y0 = v_setall_s32(Y0);
v_int32x4 v_mask = v_setall_s32(INTER_TAB_SIZE - 1);
int span = v_float32x4::nlanes;
for( ; x1 <= bw - span * 2; x1 += span * 2 )
{
v_int32x4 v_X0 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1));
v_int32x4 v_Y0 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1));
v_int32x4 v_X1 = v_shr<AB_BITS - INTER_BITS>(v__X0 + v_load(adelta + x + x1 + span));
v_int32x4 v_Y1 = v_shr<AB_BITS - INTER_BITS>(v__Y0 + v_load(bdelta + x + x1 + span));
v_int16x8 v_xy[2];
v_xy[0] = v_pack(v_shr<INTER_BITS>(v_X0), v_shr<INTER_BITS>(v_X1));
v_xy[1] = v_pack(v_shr<INTER_BITS>(v_Y0), v_shr<INTER_BITS>(v_Y1));
v_store_interleave(xy + (x1 << 1), v_xy[0], v_xy[1]);
v_int32x4 v_alpha0 = v_shl<INTER_BITS>(v_Y0 & v_mask) | (v_X0 & v_mask);
v_int32x4 v_alpha1 = v_shl<INTER_BITS>(v_Y1 & v_mask) | (v_X1 & v_mask);
v_store(alpha + x1, v_pack(v_alpha0, v_alpha1));
}
}
#endif
X和Y为映射到源图上的像素坐标。alpha将Y和X的小数部分合并存储。
for( ; x1 < bw; x1++ )
{
int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
(X & (INTER_TAB_SIZE-1)));
}
}
}
cv::remap 从src中找到dpart的源值。最近邻方式不需要第二张图。
if( interpolation == INTER_NEAREST )
remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
else
{
Mat _matA(bh, bw, CV_16U, A);
remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
}
}
}
}
成员变量。
private:
Mat src;
Mat dst;
int interpolation, borderType;
Scalar borderValue;
int *adelta, *bdelta;
const double *M;
参考资料:
Warp Affine
- How to work with OpenCV application based on OpenVX API
- OpenVX sample #7163
- opencv/cmake/FindOpenVX.cmake
- Hybrid CV/DL pipelines with OpenCV 4.4 G-API
- opencv和openvx
- Showing our vision with OpenVX™ 1.1 support for PowerVR GPUs
- Amdovx Core
- OpenVX 1.3 is Here!
- Color Copy OpenVX* Sample
- opencv ncnn warpaffine 性能测试
- OpenCV warpAffine的天坑
- Geometric Transformations of Images
- Decompose a 2D arbitrary transform into only scaling and rotation
- Decompose affine transformation (including shear in x and y)
- Decomposing an Affine transformation
- Decomposing an Affine transformation
- Given this transformation matrix, how do I decompose it into translation, rotation and scale matrices?
- Opencv 仿射变换原理代码解析
- 如何通俗地讲解「仿射变换」这个概念?
- OpenCV代码提取:warpAffine函数的实现
- NEON优化——OpenCV WarpAffine最近邻
- Subtracting 0x8000 from an int
- OpenCV2:Mat属性type,depth,step
- OpenCV中如何获取Mat类型中的步长stride及分析 C++实现
- NEON is the new black: fast JPEG optimization on ARM server
- 大端序与小端序
- Remapping
- C++ 命令模式讲解和代码示例
- Specify an origin to warpPerspective() function in OpenCV 2.x
- 计算机视觉2D几何基元及其变换介绍和OpenCV WarpPerspective源码分析