SSE架构提供对打包单精度浮点值的SIMD支持。数据传送到XMM寄存器中。
SSE指令有两个版本:后缀PS和后缀SS。PS(packed single)版本对打包单精度浮点值中的每个元素都执行运算;SS(scalar single)版本只对打包值中的低位双字执行运算。
传送数据
传送单精度浮点值很大程度上依赖于值是否在内存中对准了。
MOVAPS指令要求数据在内存中对准16字节边界,如果没有对准会出现段错误;无法保证对准的数据应改用MOVUPS指令传送。
gas汇编器使用.align命令来把数据对准特定的内存边界。
处理数据
运算指令如下:ADDPS、SUBPS、MULPS、DIVPS、RCPPS、SQRTPS、RSQRTPS、MAXPS、MINPS、ANDPS、ANDNPS、ORPS、XORPS。
这些指令都使用两个操作数:源操作数可以是128位内存位置或者XMM寄存器,目标操作数必须是XMM寄存器。
sse示例
# ssemath.s - packed single-precision SSE arithmetic example.
# Syntax: GAS / AT&T. Target: x86-64 Linux, entered at _start (no libc).
# Data is referenced with absolute addresses, so link as a non-PIE executable.
.equ SYS_EXIT, 60                   # exit(2) syscall number on x86-64 Linux

.section .data
.align 16                           # movaps faults on operands not on a 16-byte boundary
value1:
    .float 12.34, 2345., -93.2, 10.44
value2:                             # starts 16 bytes after value1, so it is aligned too
    .float 39.234, 21.4, 100.94, 10.56

.section .bss
.align 16                           # align explicitly; do not rely on .lcomm's implicit alignment
result:
    .skip 16                        # room for four packed single-precision values

.section .text
.globl _start
_start:
    nop
    movaps value1, %xmm0            # xmm0 = value1[0..3]
    movaps value2, %xmm1            # xmm1 = value2[0..3]
    addps %xmm1, %xmm0              # xmm0[i] = value1[i] + value2[i]
    sqrtps %xmm0, %xmm0             # xmm0[i] = sqrt(xmm0[i])
    maxps %xmm1, %xmm0              # xmm0[i] = max(xmm0[i], value2[i])
    movaps %xmm0, result            # store the four results
    movl $SYS_EXIT, %eax
    xorl %edi, %edi                 # exit status 0: `syscall` takes arg 1 in %rdi, not %ebx
    syscall
程序把单精度浮点值加载到XMM寄存器中,执行基本运算操作,然后把XMM0中的结果传送回内存中标签result所标记的位置。
as -g -o ssemath.o ssemath.s
ld -o ssemath ssemath.o
使用gdb进行过程中的查看:
(gdb) print $xmm0
$3 = {v4_float = {12.3400002, 2345, -93.1999969, 10.4399996}, v2_double = {5.6101725574714474e+24, 754974.88032836909}, v16_int8 = {-92, 112, 69, 65, 0, -112, 18, 69, 102, 102, -70,
-62, 61, 10, 39, 65}, v8_int16 = {28836, 16709, -28672, 17682, 26214, -15686, 2621, 16679}, v4_int32 = {1095069860, 1158844416, -1027971482, 1093077565}, v2_int64 = {
4977198868967288996, 4694732396933310054}, uint128 = 86602527020781775578804592404012036260}
(gdb) print $xmm1
$4 = {v4_float = {39.2340012, 21.3999996, 100.940002, 10.5600004}, v2_double = {228170145.05651563, 817889.63044647221}, v16_int8 = {-98, -17, 28, 66, 51, 51, -85, 65, 72, -31, -55,
66, -61, -11, 40, 65}, v8_int16 = {-4194, 16924, 13107, 16811, -7864, 17097, -2621, 16680}, v4_int32 = {1109192606, 1101738803, 1120526664, 1093203395}, v2_int64 = {
4731932128728379294, 4695272830521696584}, uint128 = 86612496260875578387995842694967259038}
相加addps后:
(gdb) s
18 sqrtps %xmm0, %xmm0
(gdb) print $xmm0
$5 = {v4_float = {51.5740013, 2366.3999, 7.74000549, 21}, v2_double = {6.014405302368266e+24, 201326624.48375034}, v16_int8 = {-57, 75, 78, 66, 102, -26, 19, 69, 32, -82, -9, 64, 0,
0, -88, 65}, v8_int16 = {19399, 16974, -6554, 17683, -20960, 16631, 0, 16808}, v4_int32 = {1112427463, 1158932070, 1089973792, 1101529088}, v2_int64 = {4977575340048010183,
4731031409642679840}, uint128 = 87272125618359850373392885123090631623}
sqrtps后,对每个元素求平方根:
(gdb) s
19 maxps %xmm1, %xmm0
(gdb) print $xmm0
$6 = {v4_float = {7.18150425, 48.6456566, 2.78208661, 4.5825758}, v2_double = {159623578059.61627, 1193.1154792614809}, v16_int8 = {-30, -50, -27, 64, 39, -107, 66, 66, -75, 13, 50,
64, 118, -92, -110, 64}, v8_int16 = {-12574, 16613, -27353, 16962, 3509, 16434, -23434, 16530}, v4_int32 = {1088802530, 1111659815, 1077022133, 1083352182}, v2_int64 = {
4774542550791212770, 4652962192817262005}, uint128 = 85832002755546427910696780764134362850}
maxps后,逐个元素比较两个打包值,保留每对元素中较大的值:
(gdb) s
20 movaps %xmm0, result
(gdb) print $xmm0
$7 = {v4_float = {39.2340012, 48.6456566, 100.940002, 10.5600004}, v2_double = {159623578681.87201, 817889.63044647221}, v16_int8 = {-98, -17, 28, 66, 39, -107, 66, 66, 72, -31,
-55, 66, -61, -11, 40, 65}, v8_int16 = {-4194, 16924, -27353, 16962, -7864, 17097, -2621, 16680}, v4_int32 = {1109192606, 1111659815, 1120526664, 1093203395}, v2_int64 = {
4774542550811602846, 4695272830521696584}, uint128 = 86612496260875578388038453117050482590}
movaps后,result得到结果:
(gdb) x /4f &result
0x600100 <result>: 39.2340012 48.6456566 100.940002 10.5600004
比较指令
SSE的比较指令同MMX比较指令类似,单独比较128位打包单精度浮点值的每个元素。
其中CMPPS和CMPSS有3个操作数,以CMPPS为例:
CMPPS imp, source, destination
imp是一个立即数,用来选择比较类型,可取如下的值:0(等于)、1(小于)、2(小于等于)、3(无序)、4(不等于)、5(不小于)、6(不小于等于)、7(有序)。
结果是位掩码(条件成立的元素各位全为1,否则全为0),存放在目标XMM寄存器中(下面的示例中是XMM0)。
Gas汇编器提供了替代imp操作数的伪指令,如下:CMPEQPS、CMPLTPS、CMPLEPS、CMPUNORDPS、CMPNEQPS、CMPNLTPS、CMPNLEPS、CMPORDPS(CMPSS也有对应的以SS结尾的形式)。
比较示例
# ssecomp.s - packed single-precision SSE comparison example.
# Syntax: GAS / AT&T. Target: x86-64 Linux, entered at _start (no libc).
# Data is referenced with absolute addresses, so link as a non-PIE executable.
.equ SYS_EXIT, 60                   # exit(2) syscall number on x86-64 Linux

.section .data
.align 16                           # movaps faults on operands not on a 16-byte boundary
value1:
    .float 12.34, 2345., -93.2, 10.44
value2:                             # starts 16 bytes after value1, so it is aligned too
    .float 12.34, 21.4, -93.2, 10.45

.section .bss
.align 16                           # align explicitly; do not rely on .lcomm's implicit alignment
result:
    .skip 16                        # room for the four 32-bit comparison masks

.section .text
.globl _start
_start:
    nop
    movaps value1, %xmm0            # xmm0 = value1[0..3]
    movaps value2, %xmm1            # xmm1 = value2[0..3]
    cmpeqps %xmm1, %xmm0            # xmm0[i] = all ones if value1[i] == value2[i], else 0
    movaps %xmm0, result            # store the four masks
    movl $SYS_EXIT, %eax
    xorl %edi, %edi                 # exit status 0: `syscall` takes arg 1 in %rdi, not %ebx
    syscall
as -g -o ssecomp.o ssecomp.s
ld -o ssecomp ssecomp.o
比较前:
(gdb) print $xmm0
$1 = {v4_float = {12.3400002, 2345, -93.1999969, 10.4399996}, v2_double = {5.6101725574714474e+24, 754974.88032836909}, v16_int8 = {-92, 112, 69, 65, 0, -112, 18, 69, 102, 102, -70,
-62, 61, 10, 39, 65}, v8_int16 = {28836, 16709, -28672, 17682, 26214, -15686, 2621, 16679}, v4_int32 = {1095069860, 1158844416, -1027971482, 1093077565}, v2_int64 = {
4977198868967288996, 4694732396933310054}, uint128 = 86602527020781775578804592404012036260}
(gdb) print $xmm1
$2 = {v4_float = {12.3400002, 21.3999996, -93.1999969, 10.4499998}, v2_double = {228170144.635625, 760217.88032836909}, v16_int8 = {-92, 112, 69, 65, 51, 51, -85, 65, 102, 102, -70,
-62, 51, 51, 39, 65}, v8_int16 = {28836, 16709, 13107, 16811, 26214, -15686, 13107, 16679}, v4_int32 = {1095069860, 1101738803, -1027971482, 1093088051}, v2_int64 = {
4731932128714256548, 4694777433960375910}, uint128 = 86603357807293900154403331565622227108}
比较后:
(gdb) print $xmm0
$3 = {v4_float = {-nan(0x7fffff), 0, -nan(0x7fffff), 0}, v2_double = {2.1219957904712067e-314, 2.1219957904712067e-314}, v16_int8 = {-1, -1, -1, -1, 0, 0, 0, 0, -1, -1, -1, -1, 0,
0, 0, 0}, v8_int16 = {-1, -1, 0, 0, -1, -1, 0, 0}, v4_int32 = {-1, 0, -1, 0}, v2_int64 = {4294967295, 4294967295}, uint128 = 79228162495817593524129366015}
(gdb) x /4x &result
0x600100 <result>: 0xffffffff 0x00000000 0xffffffff 0x00000000
掩码全为1的元素表示比较成立,即第一个和第三个单精度浮点值相等。
sse整数指令
sse提供处理64位打包整数值的一些扩展特性。扩展了MMX提供的功能。如下: