armv8,c,neon

code

#include<math.h>
#include<stdio.h>
#include<stdlib.h>
#include<time.h>

#include<sys/time.h>

#include<arm_neon.h>

/*
 * Return the current wall-clock time in milliseconds, as reported by
 * gettimeofday() (seconds * 1000 + microseconds / 1000).
 *
 * Returns -1.0 if gettimeofday() fails, so the caller never sees a value
 * computed from an uninitialized struct timeval.
 */
double get_current_time(void)
{
    struct timeval tv;

    /* gettimeofday() and struct timeval come from <sys/time.h>. */
    if (gettimeofday(&tv, NULL) != 0) {
        return -1.0;
    }

    return (double)tv.tv_sec * 1000.0 + (double)tv.tv_usec / 1000.0;
}

/*
 * Scalar reference implementation: out[i] = |src[i]| for i in [0, count).
 *
 * src   - input array, at least `count` floats
 * out   - output array, at least `count` floats
 * count - number of elements to process; values <= 0 are a no-op
 */
void abs_c(float* src, float* out, int count)
{
    /*
     * Keep the index signed like `count`: comparing a size_t index against a
     * negative int would convert the int to a huge unsigned value and run the
     * loop far past the end of both arrays.
     */
    for (int i = 0; i < count; i++) {
        /* fabsf() is the single-precision variant; no float->double round trip. */
        out[i] = fabsf(src[i]);
    }
}

void abs_neon(float* src, float* out, int count)
{
    float32x4_t a, c;
    for (size_t i = 0; i < count; i+=4)
    {
        a = vld1q_f32(src);
        c = vabsq_f32(a);
        vst1q_f32(out, c);
        src += 4;
        out += 4;
    }
}

/*
 * AArch64 inline-assembly implementation: out[i] = |src[i]| for i in [0, count).
 *
 * The assembly loop handles four floats per iteration; any remaining 1-3
 * elements are finished with scalar code, so `count` does not have to be a
 * multiple of 4.
 *
 * src   - input array, at least `count` floats
 * out   - output array, at least `count` floats
 * count - number of elements to process; values <= 0 are a no-op
 */
void abs_assembly(float* src, float* out, int count)
{
    if (count <= 0) {
        return;
    }

    /* Elements handled by the vector loop (multiple of 4) and by the tail. */
    int vec_count = count - (count % 4);
    const int tail = count - vec_count;

    if (vec_count > 0) {
        asm volatile(
            "1:                                  \n"
            "prfm pldl1keep, [%[in], #128]       \n"
            "ld1  {v0.4s}, [%[in]], #16          \n"
            "fabs v0.4s, v0.4s                   \n"
            /* %w selects the 32-bit W view: the counter is a C int. */
            "subs %w[n], %w[n], #4               \n"
            "st1  {v0.4s}, [%[out]], #16         \n"
            "bgt  1b                             \n"
            /*
             * All three operands are modified by the loop (post-indexed
             * addressing and the subs), so they must be read-write ("+r")
             * operands rather than plain inputs.
             */
            : [out] "+r"(out), [in] "+r"(src), [n] "+r"(vec_count)
            :
            : "cc", "memory", "v0"
        );
    }

    /*
     * After the asm block src/out have been advanced past the vectorized
     * part, so the tail continues from index 0 of the updated pointers.
     */
    for (int i = 0; i < tail; i++) {
        out[i] = fabsf(src[i]);
    }
}

/*
 * Benchmark driver: fills two float buffers with random values in [-1, 1),
 * then times abs_c, abs_neon and abs_assembly over the same data and prints
 * the average time per call in milliseconds.
 *
 * Returns 0 on success, EXIT_FAILURE if the buffers cannot be allocated.
 */
int main(void){
    const int num_ = 160000;
    const int loop = 2;
    double start, end, cur;

    float* src_a = malloc(sizeof *src_a * (size_t)num_);
    float* src_b = malloc(sizeof *src_b * (size_t)num_);
    if (src_a == NULL || src_b == NULL) {
        fprintf(stderr, "failed to allocate benchmark buffers\n");
        free(src_a);
        free(src_b);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < num_; i++)
    {
        src_a[i] = (rand() / (RAND_MAX + 1.0)) * 2 - 1;
        src_b[i] = (rand() / (RAND_MAX + 1.0)) * 2 - 1;
    }

    #ifdef __aarch64__          // macro predefined by the compiler
      printf("test on aarch64 plateform \n");
    #endif

    #ifdef __ARM_NEON           // macro predefined by the ARM GCC compiler
      printf("test on ARM platform \n");
    #endif

    // warm up
    for (int i = 0; i < 10; i++)
        abs_c(src_a, src_b, num_);

    // test for c
    start = get_current_time();
    for (int i = 0; i < loop; i++)
        abs_c(src_a, src_b, num_);
    end = get_current_time();
    cur = (end - start) / loop;
    printf("c test:%f | time:%f ms \n", 0., cur);

    // test for neon intrinsics
    start = get_current_time();
    for (int i = 0; i < loop; i++)
        abs_neon(src_a, src_b, num_);
    end = get_current_time();
    cur = (end - start) / loop;
    printf("neon:%f | time:%f ms \n", 1., cur);

    // test for neon assembly
    start = get_current_time();
    for (int i = 0; i < loop; i++)
        abs_assembly(src_a, src_b, num_);
    end = get_current_time();
    cur = (end - start) / loop;
    printf("assembly:%f | time:%f ms \n", 2., cur);

    free(src_a);
    free(src_b);

    return 0;
}

输出
c test:0.000000 | time:1.094116 ms
neon:1.000000 | time:0.740479 ms
assembly:2.000000 | time:0.140991 ms