	.cpu	generic+fp+simd
	.file	"mp_mul_256.c"
	.text
	.align	2
	.global	mp_mul_256
	.type	mp_mul_256, %function
//-----------------------------------------------------------------------
// uint64_t mp_mul_256(uint64_t c[8], uint64_t a[4], uint64_t b[4])
// AArch64 (AAPCS64). Full 256x256 -> 512-bit schoolbook multiply,
// product-scanning (column) order: round k sums all a[i]*b[j], i+j==k.
// In:    x0 = c (8-limb result), x1 = a (4 limbs), x2 = b (4 limbs)
// Out:   x0 = 0; c[0..7] = a * b (little-endian limbs)
// Clobb: x1-x7, x10-x15, flags. Leaf function; no stack, no callee-saved
//        registers touched.
// Invariant at the top of round k: the three rotating accumulators hold
// column k (low), column k+1 (mid), column k+2 (high, always < 2 so the
// `adc acc, xzr, acc` carry-collect cannot overflow).
//-----------------------------------------------------------------------
mp_mul_256:
.LFB0:
	.cfi_startproc
	// --- load a[0..3]
	ldr	x4, [x1]		// x4 = a[0]
	ldr	x5, [x1,8]		// x5 = a[1]
	ldr	x6, [x1,16]		// x6 = a[2]
	ldr	x7, [x1,24]		// x7 = a[3]
	// --- load b[0..3]
	ldr	x12, [x2]		// x12 = b[0]
	ldr	x13, [x2,8]		// x13 = b[1]
	ldr	x14, [x2,16]		// x14 = b[2]
	ldr	x15, [x2,24]		// x15 = b[3]
	// round 0: column 0 = a[0]*b[0]; accumulators x1(lo) x2(mid) x3(hi)
	mul	x1, x4, x12		// lo(a[0]*b[0])
	umulh	x2, x4, x12		// hi(a[0]*b[0]) seeds column 1
	mov	x3, 0			// column 2 starts empty
	str	x1, [x0]		// save c[0]
	// round 1: column 1 = a[0]*b[1] + a[1]*b[0]; x2(lo) x3(mid) x1(hi)
	mul	x10, x4, x13		// lo(a[0]*b[1])
	umulh	x11, x4, x13		// hi(a[0]*b[1])
	mov	x1, 0			// recycle x1 as the new high accumulator
	adds	x2, x10, x2
	adcs	x3, x11, x3
	adc	x1, xzr, x1		// collect carry into column 3
	mul	x10, x5, x12		// lo(a[1]*b[0])
	umulh	x11, x5, x12		// hi(a[1]*b[0])
	adds	x2, x10, x2
	adcs	x3, x11, x3
	adc	x1, xzr, x1
	str	x2, [x0,8]		// save c[1]
	// round 2: column 2 = a[0]*b[2] + a[1]*b[1] + a[2]*b[0]
	mul	x10, x4, x14		// lo(a[0]*b[2])
	umulh	x11, x4, x14		// hi(a[0]*b[2])
	mov	x2, 0
	adds	x3, x10, x3
	adcs	x1, x11, x1
	adc	x2, xzr, x2
	mul	x10, x5, x13		// lo(a[1]*b[1])
	umulh	x11, x5, x13		// hi(a[1]*b[1])
	adds	x3, x10, x3
	adcs	x1, x11, x1
	adc	x2, xzr, x2
	mul	x10, x6, x12		// lo(a[2]*b[0])
	umulh	x11, x6, x12		// hi(a[2]*b[0])
	adds	x3, x10, x3
	adcs	x1, x11, x1
	adc	x2, xzr, x2
	str	x3, [x0,16]		// save c[2]
	// round 3: column 3 = a[0]*b[3] + a[1]*b[2] + a[2]*b[1] + a[3]*b[0]
	mul	x10, x4, x15		// lo(a[0]*b[3])
	umulh	x11, x4, x15		// hi(a[0]*b[3])
	mov	x3, 0
	adds	x1, x10, x1
	adcs	x2, x11, x2
	adc	x3, xzr, x3
	mul	x10, x5, x14		// lo(a[1]*b[2])
	umulh	x11, x5, x14		// hi(a[1]*b[2])
	adds	x1, x10, x1
	adcs	x2, x11, x2
	adc	x3, xzr, x3
	mul	x10, x6, x13		// lo(a[2]*b[1])
	umulh	x11, x6, x13		// hi(a[2]*b[1])
	adds	x1, x10, x1
	adcs	x2, x11, x2
	adc	x3, xzr, x3
	mul	x10, x7, x12		// lo(a[3]*b[0])
	umulh	x11, x7, x12		// hi(a[3]*b[0])
	adds	x1, x10, x1
	adcs	x2, x11, x2
	adc	x3, xzr, x3
	str	x1, [x0,24]		// save c[3]
	// round 4: column 4 = a[1]*b[3] + a[2]*b[2] + a[3]*b[1]
	mul	x10, x5, x15		// lo(a[1]*b[3])
	umulh	x11, x5, x15		// hi(a[1]*b[3])
	mov	x1, 0
	adds	x2, x10, x2
	adcs	x3, x11, x3
	adc	x1, xzr, x1
	mul	x10, x6, x14		// lo(a[2]*b[2])
	umulh	x11, x6, x14		// hi(a[2]*b[2])
	adds	x2, x10, x2
	adcs	x3, x11, x3
	adc	x1, xzr, x1
	mul	x10, x7, x13		// lo(a[3]*b[1])
	umulh	x11, x7, x13		// hi(a[3]*b[1])
	adds	x2, x10, x2
	adcs	x3, x11, x3
	adc	x1, xzr, x1
	str	x2, [x0,32]		// save c[4]
	// round 5: column 5 = a[2]*b[3] + a[3]*b[2]
	mul	x10, x6, x15		// lo(a[2]*b[3])
	umulh	x11, x6, x15		// hi(a[2]*b[3])
	mov	x2, 0
	adds	x3, x10, x3
	adcs	x1, x11, x1
	adc	x2, xzr, x2
	mul	x10, x7, x14		// lo(a[3]*b[2])
	umulh	x11, x7, x14		// hi(a[3]*b[2])
	adds	x3, x10, x3
	adcs	x1, x11, x1
	adc	x2, xzr, x2
	str	x3, [x0,40]		// save c[5]
	// round 6: column 6 = a[3]*b[3] (comment fixed: was mislabeled "round 5")
	// No carry-collect needed: the final add cannot overflow past c[7].
	mul	x10, x7, x15		// lo(a[3]*b[3])
	umulh	x11, x7, x15		// hi(a[3]*b[3])
	adds	x1, x10, x1
	adcs	x2, x11, x2
	str	x1, [x0,48]		// save c[6]
	str	x2, [x0,56]		// save c[7]
	// --- return 0
	mov	x0, 0
	ret
	.cfi_endproc
.LFE0:
	.size	mp_mul_256, .-mp_mul_256
	.ident	"GCC: (GNU) 4.8.5 20150623 (Red Hat 4.8.5-39)"
	.section	.note.GNU-stack,"",%progbits
.file "mp_mul_256.c" .text .p2align 4,,15 .globl mp_mul_256 .type mp_mul_256, @function mp_mul_256: .LFB0: # uint64_t mp_mul_256(uint64_t c[8], uint64_t a[4], uint64_t b[4]) # rdi = c[] # rsi = a[] # rdx = b[] .cfi_startproc # -------------------------- # backup rbx,r12,r13,r14,r15 # -------------------------- movq %rbx, %xmm1 movq %r12, %xmm2 movq %r13, %xmm3 movq %r14, %xmm4 movq %r15, %xmm5 # -------------------------------- # load a[0,1,2,3] to r8,r9,r10,r11 # -------------------------------- movq (%rsi), %r8 movq 8(%rsi), %r9 movq 16(%rsi), %r10 movq 24(%rsi), %r11 # ---------------------------------- # load b[0,1,2,3] to r12,r13,r14,r15 # ---------------------------------- movq (%rdx), %r12 movq 8(%rdx), %r13 movq 16(%rdx), %r14 movq 24(%rdx), %r15 # round 0 xorq %rbx, %rbx xorq %rcx, %rcx movq %r8, %rax #rax = a[0] mulq %r12 #a[0]*b[0] # ---- movq %rax, (%rdi) #save c[0] movq %rdx, %rsi # round 1 movq %r8, %rax #rax = a[0] mulq %r13 #a[0]*b[1] addq %rax, %rsi adcq %rdx, %rbx # ---- movq %r9, %rax #rax = a[1] mulq %r12 #a[1]*b[0] addq %rax, %rsi adcq %rdx, %rbx adcq $0, %rcx # ---- movq %rsi, 8(%rdi) #save c[1] # round 2 movq %r8, %rax #rax = a[0] mulq %r14 #a[0]*b[2] xorq %rsi, %rsi addq %rax, %rbx adcq %rdx, %rcx adcq $0, %rsi # ---- movq %r9, %rax #rax = a[1] mulq %r13 #a[1]*b[1] addq %rax, %rbx adcq %rdx, %rcx adcq $0, %rsi # ---- movq %r10, %rax #rax = a[2] mulq %r12 #a[2]*b[0] addq %rax, %rbx adcq %rdx, %rcx adcq $0, %rsi # ---- movq %rbx, 16(%rdi) #save c[2] # round 3 movq %r8, %rax #rax = a[0] mulq %r15 #a[0]*b[3] xorq %rbx, %rbx addq %rax, %rcx adcq %rdx, %rsi adcq $0, %rbx # ---- movq %r9, %rax #rax = a[1] mulq %r14 #a[1]*b[2] addq %rax, %rcx adcq %rdx, %rsi adcq $0, %rbx # ---- movq %r10, %rax #rax = a[2] mulq %r13 #a[2]*b[1] addq %rax, %rcx adcq %rdx, %rsi adcq $0, %rbx # ---- movq %r11, %rax #rax = a[3] mulq %r12 #a[3]*b[0] addq %rax, %rcx adcq %rdx, %rsi adcq $0, %rbx # ---- movq %rcx, 24(%rdi) #save c[3] # round 4 movq %r9, %rax #rax = a[1] 
mulq %r15 #a[1]*b[3] xorq %rcx, %rcx addq %rax, %rsi adcq %rdx, %rbx adcq $0, %rcx # ---- movq %r10, %rax #rax = a[2] mulq %r14 #a[2]*b[2] addq %rax, %rsi adcq %rdx, %rbx adcq $0, %rcx # ---- movq %r11, %rax #rax = a[3] mulq %r13 #a[3]*b[1] addq %rax, %rsi adcq %rdx, %rbx adcq $0, %rcx # ---- movq %rsi, 32(%rdi) #save c[4] # round 5 movq %r10, %rax #rax = a[2] mulq %r15 #a[2]*b[3] xorq %rsi, %rsi addq %rax, %rbx adcq %rdx, %rcx adcq $0, %rsi # ---- movq %r11, %rax #rax = a[3] mulq %r14 #a[3]*b[2] addq %rax, %rbx adcq %rdx, %rcx adcq $0, %rsi # ---- movq %rbx, 40(%rdi) #save c[5] # round 6 movq %r11, %rax #rax = a[3] mulq %r15 #a[3]*b[3] addq %rax, %rcx adcq %rdx, %rsi # ---- movq %rcx, 48(%rdi) #save c[6] movq %rsi, 56(%rdi) #save c[7] # --------------------------- # restore rbx,r12,r13,r14,r15 # --------------------------- movq %xmm1, %rbx movq %xmm2, %r12 movq %xmm3, %r13 movq %xmm4, %r14 movq %xmm5, %r15 # ---- # done # ---- emms xorq %rax, %rax ret .cfi_endproc .LFE0: .size mp_mul_256, .-mp_mul_256 .ident "GCC: (GNU) 4.4.7 20120313 (Red Hat 4.4.7-23)" .section .note.GNU-stack,"",@progbits