SIMD and Avx2

比眉伴天荒 2022-10-05 00:49 231阅读 0赞

SIMD(Single Instruction, Multiple Data)指一条指令可以同时对多组数据执行计算并输出结果;与之相对的是 SISD(Single Instruction, Single Data),即一条指令只处理一组数据。
Intel 平台上常见的 SIMD 指令集有 SSE2、AVX2、AVX-512。

假设有一个任务是统计字符串中某个字符出现的次数,我们可以用 128 bit 的 SIMD 指令进行统计。每 8 个 bit 代表一个字符,所以一个 128 bit 寄存器一次可以处理 16 个字符:先用一条 SIMD 指令逐字节比较,之后只需要再用两条指令(move mask、pop count)就能得到这 16 个字符中匹配的个数。
在这里插入图片描述

详细测试:

  1. #include <stdio.h>
  2. #include <thread>
  3. #define INC_TO 1000000 // one million...
  4. #include <mutex>
  5. #include <functional>
  6. #include <atomic>
  7. #include <vector>
  8. #include <sstream>
  9. #include <iostream>
  10. #include <emmintrin.h>
  11. #include <immintrin.h>
  12. #include <assert.h>
  13. #include <x86intrin.h>
  // A raw character buffer paired with its length in bytes.
  struct StringView {
      char *buffer;  // first byte of the character data
      size_t len;    // number of bytes stored in `buffer`
  };
  // Writes the benchmark input to `filename`, replacing any existing content:
  // the 25-letter run 'A'..'Y' repeated 16*8*1000 times (3,200,000 bytes).
  //
  // If the file cannot be opened, written or closed, the error is reported on
  // stderr and the function returns early.
  void RandomGeneratorFile(const char* filename) {
      FILE* fp = fopen(filename, "w");
      if (fp == NULL) {
          perror("fopen");
          return;
      }
      const size_t numbers = 16 * 8 * 1000;
      for (size_t count = 0; count < numbers; count++) {
          // The upper bound is exclusive, so 'Z' itself is never written.
          for (char c = 'A'; c < 'Z'; c++) {
              if (fputc(c, fp) == EOF) {
                  perror("fputc");
                  fclose(fp);
                  return;
              }
          }
      }
      // fclose flushes buffered data, so a failure here means lost output.
      if (fclose(fp) != 0) {
          perror("fclose");
      }
  }
  30. StringView* GetFileContent(const char* filename) {
  31. FILE* fp = fopen(filename, "r");
  32. /*Move file point at the end of file.*/
  33. fseek(fp,0,SEEK_END);
  34. /*Get the current position of the file pointer.*/
  35. size_t size=ftell(fp);
  36. printf("file size:%d\n", size);
  37. char * buffer = new char[size];
  38. fseek(fp, 0, SEEK_SET);
  39. fread(buffer, size, size, fp);
  40. // printf("content of buffer:%s\n", buffer);
  41. fclose(fp);
  42. StringView* str = new StringView();
  43. str->buffer = buffer;
  44. str->len = size;
  45. return str;
  46. }
  // Scalar baseline: counts the bytes in data[0, size) that equal `ch`,
  // examining one byte per iteration.
  size_t count_chars_8(const char* data, size_t size, const char ch)
  {
      size_t matches = 0;
      for (size_t i = 0; i < size; ++i) {
          if (data[i] == ch) {
              ++matches;
          }
      }
      return matches;
  }
  // SSE2 version: counts the bytes in data[0, size) that equal `ch`,
  // comparing 16 bytes per iteration.
  //
  // `data` may have any alignment and `size` may be any value: full 16-byte
  // chunks go through the vector loop and the remaining size % 16 bytes are
  // counted one at a time.
  size_t count_chars_128(const char* data, size_t size, const char ch)
  {
      size_t total = 0;
      // Broadcast ch into all 16 byte lanes.
      const __m128i tocmp = _mm_set1_epi8(ch);
      while (size >= 16) {
          // Unaligned load: the aligned form faults on pointers that are not
          // 16-byte aligned, which callers are not required to provide.
          const __m128i chunk = _mm_loadu_si128((const __m128i*)data);
          // Byte-wise compare: matching lanes become 0xFF, all others 0x00.
          const __m128i results = _mm_cmpeq_epi8(chunk, tocmp);
          // Gather the top bit of every lane into the low 16 bits of an int.
          const int mask = _mm_movemask_epi8(results);
          // Each set bit is one matching byte. This is a plain integer
          // operation on a general-purpose register, not a vector one.
          total += (size_t)__builtin_popcount((unsigned)mask);
          data += 16;
          size -= 16;
      }
      // Scalar tail for whatever did not fill a whole 16-byte chunk.
      for (; size != 0; --size, ++data) {
          if (*data == ch) {
              total += 1;
          }
      }
      return total;
  }
  // AVX2 version: counts the bytes in data[0, size) that equal `ch`,
  // comparing 32 bytes per iteration.
  //
  // `data` may have any alignment and `size` may be any value: full 32-byte
  // chunks go through the vector loop and the remaining size % 32 bytes are
  // counted one at a time.
  //
  // The target attribute lets this function build even when the translation
  // unit is compiled without -mavx2; the CPU must still support AVX2 when the
  // function is actually called.
  __attribute__((target("avx2")))
  size_t count_chars_avx(const char* data, size_t size, const char ch)
  {
      size_t total = 0;
      // Broadcast ch into all 32 byte lanes.
      const __m256i tocmp = _mm256_set1_epi8(ch);
      while (size >= 32) {
          const __m256i chunk = _mm256_loadu_si256((const __m256i*)data);
          // Byte-wise compare: matching lanes become 0xFF, all others 0x00.
          const __m256i results = _mm256_cmpeq_epi8(tocmp, chunk);
          // One bit per lane; all 32 bits of the mask are meaningful here.
          const unsigned mask = (unsigned)_mm256_movemask_epi8(results);
          total += (size_t)__builtin_popcount(mask);
          data += 32;
          size -= 32;
      }
      // Scalar tail for whatever did not fill a whole 32-byte chunk.
      for (; size != 0; --size, ++data) {
          if (*data == ch) {
              total += 1;
          }
      }
      return total;
  }
  // Calls fn(a, size, ch) once and prints to stdout how long the call took,
  // in microseconds, labelled with `name`.
  //
  // name: label shown in the output line.
  // fn:   counting function to time.
  // a, size, ch: arguments forwarded unchanged to fn.
  static void print_time_us(const char* name,
                            size_t (*fn)(const char*, size_t, const char),
                            const char* a, size_t size, const char ch) {
      struct timespec start, end;
      clock_gettime(CLOCK_MONOTONIC_RAW, &start);
      // Store the result in a volatile so the optimizer cannot discard the
      // very call that is being timed.
      volatile size_t sink = fn(a, size, ch);
      clock_gettime(CLOCK_MONOTONIC_RAW, &end);
      (void)sink;
      // Do the arithmetic in long long: the nanosecond difference may be
      // negative, and the seconds term must not overflow a narrower time_t.
      const long long sec = (long long)(end.tv_sec - start.tv_sec);
      const long long nsec = (long long)(end.tv_nsec - start.tv_nsec);
      const unsigned long long delta_us =
          (unsigned long long)((sec * 1000000000LL + nsec) / 1000);
      printf("Running: '%s' took %llu u/s\n", name, delta_us);
  }
  109. int main()
  110. {
  111. RandomGeneratorFile("test.file");
  112. StringView* str = GetFileContent("test.file");
  113. print_time_us("NORMAL", count_chars_8, str->buffer, str->len, 'A');
  114. print_time_us("SIMD", count_chars_128, str->buffer, str->len, 'A');
  115. print_time_us("AVX", count_chars_avx, str->buffer, str->len, 'A');
  116. return 0;
  117. }

编译命令:g++ -std=c++14 main.cc -o main -mavx -mavx2 -O2

输出:

  1. file size:3200000
  2. Running: 'NORMAL' took 2505 u/s
  3. Running: 'SIMD' took 192 u/s
  4. Running: 'AVX' took 97 u/s

发表评论

表情:
评论列表 (有 0 条评论,231人围观)

还没有评论,来说两句吧...

相关阅读