Diffstat (limited to 'avx2.h')
-rw-r--r-- | avx2.h | 355 |
1 files changed, 355 insertions, 0 deletions
@@ -0,0 +1,355 @@
/* PIPAPO - PIle PAcket POlicies
 *
 * avx2.h - Lookup routines based on AVX2 intrinsics
 *
 * Author: Stefano Brivio <sbrivio@redhat.com>
 * License: GPLv2
 */

#include <immintrin.h>
#include <stdint.h>

/* Load one 32-byte slice of the bucket for group 'g', value 'v'. BUCKETS
 * (one bucket per possible 4-bit group value, i.e. 16) is expected to be
 * defined by the including file. Callers pass 'lt' already offset by the
 * slice position within the bucket.
 */
#define AVX_LOAD(lt, g, v, bsize)					\
	_mm256_stream_load_si256((__m256i *)((lt) + ((g) * BUCKETS + (v)) * (bsize)))

/**
 * avx2_lookup4() - AVX2-based lookup for packet fields of 4 four-bit groups
 * @map: Previous matching bitmap that will be updated
 * @lt: Lookup table for this field
 * @pkt: Packet bytes to be matched
 * @bsize: Bucket size for this lookup table, in bytes
 * @first: If this is the first field in a set, start with all-ones bitmap
 * @last: Last field: stop on the first match and return position
 *
 * Load buckets from the lookup table corresponding to the values of each
 * 4-bit group of packet bytes, and perform a bitwise intersection between
 * them. If this is the first field in the set, simply AND the buckets
 * together (equivalent to using an all-ones starting bitmap); otherwise,
 * start from the provided bitmap. Then store the resulting match bitmap in
 * @map.
 *
 * This implements steps 4.1 to 4.3 of the algorithm description, and is used
 * for 16-bit fields (e.g. ports).
 *
 * Return: 32-byte rounded position of the match if this is the last field,
 * otherwise 0 on match and -1 on no match.
 */
__always_inline int avx2_lookup4(uint8_t *map, uint8_t *lt, uint8_t *pkt,
				 int bsize, int first, int last)
{
	__m256i r0, r1, r2, r3, r4;
	int i, ret = -1;

	for (i = 0; i < bsize; i += 32) {
		r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);

		if (first) {
			r4 = _mm256_and_si256(r1, r0);
		} else {
			r4 = _mm256_stream_load_si256((__m256i *)(map + i));

			r4 = _mm256_and_si256(r0, r4);
			r4 = _mm256_and_si256(r1, r4);
		}
		r4 = _mm256_and_si256(r2, r4);
		r4 = _mm256_and_si256(r3, r4);

		if (!_mm256_testz_si256(r4, r4)) {
			if (last) {
				_mm256_store_si256((__m256i *)(map + i), r4);
				return i;
			}
			ret = 0;
		}
		_mm256_store_si256((__m256i *)(map + i), r4);
	}

	return ret;
}
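To make the intrinsics above easier to follow, here is a plain C sketch of the same per-slice operation: for every byte of the match bitmap, AND together the corresponding bytes of the buckets selected by the four packet nibbles. This scalar helper is illustrative only (the name is made up, it is not part of this file) and assumes the same BUCKETS constant and bucket layout:

/* Hypothetical scalar equivalent of avx2_lookup4(): same bucket addressing
 * as AVX_LOAD, one byte at a time instead of 32.
 */
static int scalar_lookup4(uint8_t *map, uint8_t *lt, uint8_t *pkt,
			  int bsize, int first, int last)
{
	int i, ret = -1;

	for (i = 0; i < bsize; i++) {
		uint8_t v = first ? 0xff : map[i];

		v &= lt[(0 * BUCKETS + (pkt[0] >> 4))   * bsize + i];
		v &= lt[(1 * BUCKETS + (pkt[0] & 0x0f)) * bsize + i];
		v &= lt[(2 * BUCKETS + (pkt[1] >> 4))   * bsize + i];
		v &= lt[(3 * BUCKETS + (pkt[1] & 0x0f)) * bsize + i];

		map[i] = v;
		if (v) {
			if (last)
				return i & ~31;	/* 32-byte rounded position */
			ret = 0;
		}
	}

	return ret;
}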
/**
 * avx2_lookup8() - AVX2-based lookup for packet fields of 8 four-bit groups
 * @map: Previous matching bitmap that will be updated
 * @lt: Lookup table for this field
 * @pkt: Packet bytes to be matched
 * @bsize: Bucket size for this lookup table, in bytes
 * @first: If this is the first field in a set, start with all-ones bitmap
 * @last: Last field: stop on the first match and return position
 *
 * Load buckets from the lookup table corresponding to the values of each
 * 4-bit group of packet bytes, and perform a bitwise intersection between
 * them. If this is the first field in the set, simply AND the buckets
 * together (equivalent to using an all-ones starting bitmap); otherwise,
 * start from the provided bitmap. Then store the resulting match bitmap in
 * @map.
 *
 * This implements steps 4.1 to 4.3 of the algorithm description, and is used
 * for 32-bit fields (e.g. IPv4 addresses).
 *
 * Return: 32-byte rounded position of the match if this is the last field,
 * otherwise 0 on match and -1 on no match.
 */
__always_inline int avx2_lookup8(uint8_t *map, uint8_t *lt, uint8_t *pkt,
				 int bsize, int first, int last)
{
	__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8;
	int i, ret = -1;

	for (i = 0; i < bsize; i += 32) {
		r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
		r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
		r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
		r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
		r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);

		if (first) {
			r8 = _mm256_and_si256(r1, r0);
		} else {
			r8 = _mm256_stream_load_si256((__m256i *)(map + i));
			r8 = _mm256_and_si256(r0, r8);
			r8 = _mm256_and_si256(r1, r8);
		}
		r8 = _mm256_and_si256(r2, r8);
		r8 = _mm256_and_si256(r3, r8);
		r8 = _mm256_and_si256(r4, r8);
		r8 = _mm256_and_si256(r5, r8);
		r8 = _mm256_and_si256(r6, r8);
		r8 = _mm256_and_si256(r7, r8);

		if (!_mm256_testz_si256(r8, r8)) {
			if (last) {
				_mm256_store_si256((__m256i *)(map + i), r8);
				return i;
			}
			ret = 0;
		}
		_mm256_store_si256((__m256i *)(map + i), r8);
	}

	return ret;
}
/**
 * avx2_lookup12() - AVX2-based lookup for packet fields of 12 four-bit groups
 * @map: Previous matching bitmap that will be updated
 * @lt: Lookup table for this field
 * @pkt: Packet bytes to be matched
 * @bsize: Bucket size for this lookup table, in bytes
 * @first: If this is the first field in a set, start with all-ones bitmap
 * @last: Last field: stop on the first match and return position
 *
 * Load buckets from the lookup table corresponding to the values of each
 * 4-bit group of packet bytes, and perform a bitwise intersection between
 * them. If this is the first field in the set, simply AND the buckets
 * together (equivalent to using an all-ones starting bitmap); otherwise,
 * start from the provided bitmap. Then store the resulting match bitmap in
 * @map.
 *
 * This implements steps 4.1 to 4.3 of the algorithm description, and is used
 * for 48-bit fields (e.g. MAC addresses).
 *
 * Return: 32-byte rounded position of the match if this is the last field,
 * otherwise 0 on match and -1 on no match.
 */
__always_inline int avx2_lookup12(uint8_t *map, uint8_t *lt, uint8_t *pkt,
				  int bsize, int first, int last)
{
	__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12;
	int i, ret = -1;

	for (i = 0; i < bsize; i += 32) {
		r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
		r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
		r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
		r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
		r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);
		r8 = AVX_LOAD(lt + i, 8, pkt[4] >> 4, bsize);
		r9 = AVX_LOAD(lt + i, 9, pkt[4] & 0x0f, bsize);
		r10 = AVX_LOAD(lt + i, 10, pkt[5] >> 4, bsize);
		r11 = AVX_LOAD(lt + i, 11, pkt[5] & 0x0f, bsize);

		if (first) {
			r12 = _mm256_and_si256(r0, r1);
		} else {
			r12 = _mm256_stream_load_si256((__m256i *)(map + i));
			r12 = _mm256_and_si256(r0, r12);
			r12 = _mm256_and_si256(r1, r12);
		}

		r12 = _mm256_and_si256(r2, r12);
		r12 = _mm256_and_si256(r3, r12);
		r12 = _mm256_and_si256(r4, r12);
		r12 = _mm256_and_si256(r5, r12);
		r12 = _mm256_and_si256(r6, r12);
		r12 = _mm256_and_si256(r7, r12);
		r12 = _mm256_and_si256(r8, r12);
		r12 = _mm256_and_si256(r9, r12);
		r12 = _mm256_and_si256(r10, r12);
		r12 = _mm256_and_si256(r11, r12);

		if (!_mm256_testz_si256(r12, r12)) {
			if (last) {
				_mm256_store_si256((__m256i *)(map + i), r12);
				return i;
			}
			ret = 0;
		}
		_mm256_store_si256((__m256i *)(map + i), r12);
	}

	return ret;
}
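As a concrete example of the group decomposition driving the loads above, each field byte contributes two 4-bit groups, high nibble first, so a 48-bit MAC address selects 12 buckets. A hypothetical helper (not part of this file) making the mapping explicit:

/* Split packet bytes into 4-bit group values, high nibble first, mirroring
 * the pkt[b] >> 4 and pkt[b] & 0x0f operands passed to AVX_LOAD above.
 */
static void pkt_to_groups(const uint8_t *pkt, int bytes, uint8_t *groups)
{
	int b;

	for (b = 0; b < bytes; b++) {
		groups[2 * b]     = pkt[b] >> 4;	/* even group */
		groups[2 * b + 1] = pkt[b] & 0x0f;	/* odd group */
	}
}

/* e.g. MAC 00:11:22:33:44:55 (6 bytes) yields the 12 groups:
 * 0x0, 0x0, 0x1, 0x1, 0x2, 0x2, 0x3, 0x3, 0x4, 0x4, 0x5, 0x5
 */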
/**
 * avx2_lookup32() - AVX2-based lookup for packet fields of 32 four-bit groups
 * @map: Previous matching bitmap that will be updated
 * @lt: Lookup table for this field
 * @pkt: Packet bytes to be matched
 * @bsize: Bucket size for this lookup table, in bytes
 * @first: If this is the first field in a set, start with all-ones bitmap
 * @last: Last field: stop on the first match and return position
 *
 * Load buckets from the lookup table corresponding to the values of each
 * 4-bit group of packet bytes, and perform a bitwise intersection between
 * them. If this is the first field in the set, simply AND the buckets
 * together (equivalent to using an all-ones starting bitmap); otherwise,
 * start from the provided bitmap. Then store the resulting match bitmap in
 * @map.
 *
 * This implements steps 4.1 to 4.3 of the algorithm description, and is used
 * for 128-bit fields (e.g. IPv6 addresses).
 *
 * Return: 32-byte rounded position of the match if this is the last field,
 * otherwise 0 on match and -1 on no match.
 */
__always_inline int avx2_lookup32(uint8_t *map, uint8_t *lt, uint8_t *pkt,
				  int bsize, int first, int last)
{
	/* Only 16 ymm registers are available: fold the 32 buckets into r15
	 * in three batches, reusing r0 to r13 for the loads.
	 */
	__m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
		r15;
	int i, ret = -1;

	for (i = 0; i < bsize; i += 32) {
		r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
		r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
		r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
		r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
		r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);
		r8 = AVX_LOAD(lt + i, 8, pkt[4] >> 4, bsize);
		r9 = AVX_LOAD(lt + i, 9, pkt[4] & 0x0f, bsize);
		r10 = AVX_LOAD(lt + i, 10, pkt[5] >> 4, bsize);
		r11 = AVX_LOAD(lt + i, 11, pkt[5] & 0x0f, bsize);
		r12 = AVX_LOAD(lt + i, 12, pkt[6] >> 4, bsize);
		r13 = AVX_LOAD(lt + i, 13, pkt[6] & 0x0f, bsize);

		if (first) {
			r15 = _mm256_and_si256(r0, r1);
		} else {
			r15 = _mm256_stream_load_si256((__m256i *)(map + i));
			r15 = _mm256_and_si256(r0, r15);
			r15 = _mm256_and_si256(r1, r15);
		}

		r15 = _mm256_and_si256(r2, r15);
		r15 = _mm256_and_si256(r3, r15);
		r15 = _mm256_and_si256(r4, r15);
		r15 = _mm256_and_si256(r5, r15);
		r15 = _mm256_and_si256(r6, r15);
		r15 = _mm256_and_si256(r7, r15);
		r15 = _mm256_and_si256(r8, r15);
		r15 = _mm256_and_si256(r9, r15);
		r15 = _mm256_and_si256(r10, r15);
		r15 = _mm256_and_si256(r11, r15);
		r15 = _mm256_and_si256(r12, r15);
		r15 = _mm256_and_si256(r13, r15);

		/* Second batch: groups 14 to 27 */
		r0 = AVX_LOAD(lt + i, 14, pkt[7] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 15, pkt[7] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 16, pkt[8] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 17, pkt[8] & 0x0f, bsize);
		r4 = AVX_LOAD(lt + i, 18, pkt[9] >> 4, bsize);
		r5 = AVX_LOAD(lt + i, 19, pkt[9] & 0x0f, bsize);
		r6 = AVX_LOAD(lt + i, 20, pkt[10] >> 4, bsize);
		r7 = AVX_LOAD(lt + i, 21, pkt[10] & 0x0f, bsize);
		r8 = AVX_LOAD(lt + i, 22, pkt[11] >> 4, bsize);
		r9 = AVX_LOAD(lt + i, 23, pkt[11] & 0x0f, bsize);
		r10 = AVX_LOAD(lt + i, 24, pkt[12] >> 4, bsize);
		r11 = AVX_LOAD(lt + i, 25, pkt[12] & 0x0f, bsize);
		r12 = AVX_LOAD(lt + i, 26, pkt[13] >> 4, bsize);
		r13 = AVX_LOAD(lt + i, 27, pkt[13] & 0x0f, bsize);

		r15 = _mm256_and_si256(r0, r15);
		r15 = _mm256_and_si256(r1, r15);
		r15 = _mm256_and_si256(r2, r15);
		r15 = _mm256_and_si256(r3, r15);
		r15 = _mm256_and_si256(r4, r15);
		r15 = _mm256_and_si256(r5, r15);
		r15 = _mm256_and_si256(r6, r15);
		r15 = _mm256_and_si256(r7, r15);
		r15 = _mm256_and_si256(r8, r15);
		r15 = _mm256_and_si256(r9, r15);
		r15 = _mm256_and_si256(r10, r15);
		r15 = _mm256_and_si256(r11, r15);
		r15 = _mm256_and_si256(r12, r15);
		r15 = _mm256_and_si256(r13, r15);

		/* Third batch: groups 28 to 31 */
		r0 = AVX_LOAD(lt + i, 28, pkt[14] >> 4, bsize);
		r1 = AVX_LOAD(lt + i, 29, pkt[14] & 0x0f, bsize);
		r2 = AVX_LOAD(lt + i, 30, pkt[15] >> 4, bsize);
		r3 = AVX_LOAD(lt + i, 31, pkt[15] & 0x0f, bsize);

		r15 = _mm256_and_si256(r0, r15);
		r15 = _mm256_and_si256(r1, r15);
		r15 = _mm256_and_si256(r2, r15);
		r15 = _mm256_and_si256(r3, r15);

		if (!_mm256_testz_si256(r15, r15)) {
			if (last) {
				_mm256_store_si256((__m256i *)(map + i), r15);
				return i;
			}
			ret = 0;
		}
		_mm256_store_si256((__m256i *)(map + i), r15);
	}

	return ret;
}
/**
 * avx2_lookup() - AVX2-based lookup for packet fields
 * @map: Previous matching bitmap that will be updated
 * @lt: Lookup table for this field
 * @pkt: Packet bytes to be matched
 * @groups: Number of 4-bit groups in this field
 * @bsize: Bucket size for this lookup table, in bytes
 * @first: If this is the first field in a set, start with all-ones bitmap
 * @last: If this is the last field in a set, stop on the first match
 *
 * This implements steps 4.1 to 4.3 of the algorithm description.
 *
 * Return: 32-byte rounded position of the match if this is the last field,
 * otherwise 0 on match and -1 on no match.
 */
__always_inline int avx2_lookup(uint8_t *map, uint8_t *lt, uint8_t *pkt,
				int groups, int bsize, int first, int last)
{
	if (groups == 4)
		return avx2_lookup4(map, lt, pkt, bsize, first, last);
	if (groups == 8)
		return avx2_lookup8(map, lt, pkt, bsize, first, last);
	if (groups == 12)
		return avx2_lookup12(map, lt, pkt, bsize, first, last);
	if (groups == 32)
		return avx2_lookup32(map, lt, pkt, bsize, first, last);

	/* Only 16-, 32-, 48- and 128-bit fields are expected here */
	return 0;
}
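For context, the call pattern across a set's fields might look like the sketch below. Note that the full algorithm translates the matched bits through a mapping table between fields (the steps after 4.3, not implemented in this header), so this only illustrates the first/last flags and the dispatch on group count; the struct and function names are hypothetical:

/* Hypothetical driver loop: field 0 starts from an all-ones bitmap, the
 * last field returns the 32-byte rounded match position. The bitmap
 * translation between fields (after step 4.3) is elided here.
 */
struct field {
	uint8_t *lt;	/* lookup table for this field */
	int groups;	/* number of 4-bit groups */
	int bsize;	/* bucket size, in bytes */
	int pkt_off;	/* offset of field bytes in the packet */
};

static int lookup_all(uint8_t *map, struct field *f, int n, uint8_t *pkt)
{
	int i, ret = -1;

	for (i = 0; i < n; i++) {
		ret = avx2_lookup(map, f[i].lt, pkt + f[i].pkt_off,
				  f[i].groups, f[i].bsize,
				  i == 0, i == n - 1);
		if (ret < 0)
			return -1;	/* empty intersection: no match */

		/* ...map-to-next-field translation would go here... */
	}

	return ret;	/* position returned by the last field */
}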