Diffstat (limited to 'avx2.h')
-rw-r--r--  avx2.h  355
1 file changed, 355 insertions, 0 deletions
diff --git a/avx2.h b/avx2.h
new file mode 100644
index 0000000..fca4c60
--- /dev/null
+++ b/avx2.h
@@ -0,0 +1,355 @@
+/* PIPAPO - PIle PAcket POlicies
+ *
+ * avx2.h - Lookup routines based on AVX2 intrinsics
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ * License: GPLv2
+ */
+
+#include <immintrin.h>
+#include <stdint.h>
+
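+/* Load one 32-byte chunk of the bucket that value @v selects for 4-bit group
+ * @g, in a lookup table with BUCKETS buckets of @bsize bytes per group, using
+ * a non-temporal streaming load. Callers fold the chunk offset into @lt.
+ */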
+#define AVX_LOAD(lt, g, v, bsize) \
+	_mm256_stream_load_si256((__m256i *)((lt) + ((g) * BUCKETS + (v)) * (bsize)))
+
+/**
+ * avx2_lookup4() - AVX2-based lookup for packet fields of 4 four-bit groups
+ * @map: Previous matching bitmap that will be updated
+ * @lt: Lookup table for this field
+ * @pkt: Packet bytes to be matched
+ * @bsize: Bucket size for this lookup table, in bytes
+ * @first: If this is the first field in a set, start with an all-ones bitmap
+ * @last: Last field: stop on the first match and return its position
+ *
+ * Load buckets from the lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap); otherwise, also AND them
+ * with the provided starting bitmap. Then store the resulting match bitmap in
+ * @map.
+ *
+ * This implements steps 4.1 to 4.3 of the algorithm description, and is used
+ * for 16-bit fields (e.g. ports).
+ *
+ * Return: 32-byte rounded position of the match if this is the last field;
+ * otherwise 0 on match and -1 on no match.
+ */
+__always_inline int avx2_lookup4(uint8_t *map, uint8_t *lt, uint8_t *pkt,
+ int bsize, int first, int last)
+{
+ __m256i r0, r1, r2, r3, r4;
+ int i, ret = -1;
+
+ for (i = 0; i < bsize; i += 32) {
+ r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
+
+ if (first) {
+ r4 = _mm256_and_si256(r1, r0);
+ } else {
+ r4 = _mm256_stream_load_si256((__m256i *)(map + i));
+
+ r4 = _mm256_and_si256(r0, r4);
+ r4 = _mm256_and_si256(r1, r4);
+ }
+ r4 = _mm256_and_si256(r2, r4);
+ r4 = _mm256_and_si256(r3, r4);
+
+ if (!_mm256_testz_si256(r4, r4)) {
+ if (last) {
+ _mm256_store_si256((__m256i *)(map + i), r4);
+ return i;
+ }
+ ret = 0;
+ }
+ _mm256_store_si256((__m256i *)(map + i), r4);
+ }
+
+ return ret;
+}
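
For reference, here is a plain-C sketch of what one 32-byte iteration of
avx2_lookup4() computes, with the intrinsics spelled out as byte-wise
operations. It is purely illustrative and not part of this patch: it assumes
BUCKETS is 16 (one bucket per possible 4-bit value), that bsize is a multiple
of 32 as the AVX2 code already requires, and the function name is made up. It
relies only on <stdint.h>, which the header already includes.

static int scalar_lookup4(uint8_t *map, const uint8_t *lt, const uint8_t *pkt,
                          int bsize, int first, int last)
{
    int i, j, ret = -1;

    for (i = 0; i < bsize; i += 32) {
        int match = 0;

        for (j = i; j < i + 32; j++) {
            /* All-ones start for the first field, previous bitmap otherwise */
            uint8_t w = first ? 0xff : map[j];

            /* AND in the bucket selected by each of the four 4-bit groups */
            w &= lt[(0 * 16 + (pkt[0] >> 4))   * bsize + j];
            w &= lt[(1 * 16 + (pkt[0] & 0x0f)) * bsize + j];
            w &= lt[(2 * 16 + (pkt[1] >> 4))   * bsize + j];
            w &= lt[(3 * 16 + (pkt[1] & 0x0f)) * bsize + j];

            map[j] = w;
            match |= w;
        }

        if (match) {
            if (last)
                return i;   /* 32-byte rounded position of the match */
            ret = 0;
        }
    }

    return ret;
}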
+
+/**
+ * avx2_lookup8() - AVX2-based lookup for packet fields of 8 four-bit groups
+ * @map: Previous matching bitmap that will be updated
+ * @lt: Lookup table for this field
+ * @pkt: Packet bytes to be matched
+ * @bsize: Bucket size for this lookup table, in bytes
+ * @first: If this is the first field in a set, start with an all-ones bitmap
+ * @last: Last field: stop on the first match and return its position
+ *
+ * Load buckets from the lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap); otherwise, also AND them
+ * with the provided starting bitmap. Then store the resulting match bitmap in
+ * @map.
+ *
+ * This implements steps 4.1 to 4.3 of the algorithm description, and is used
+ * for 32-bit fields (e.g. IPv4 addresses).
+ *
+ * Return: 32-byte rounded position of the match if this is the last field;
+ * otherwise 0 on match and -1 on no match.
+ */
+__always_inline int avx2_lookup8(uint8_t *map, uint8_t *lt, uint8_t *pkt,
+ int bsize, int first, int last)
+{
+ __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8;
+ int i, ret = -1;
+
+ for (i = 0; i < bsize; i += 32) {
+ r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
+ r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
+ r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
+ r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
+ r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);
+
+ if (first) {
+ r8 = _mm256_and_si256(r1, r0);
+ } else {
+ r8 = _mm256_stream_load_si256((__m256i *)(map + i));
+ r8 = _mm256_and_si256(r0, r8);
+ r8 = _mm256_and_si256(r1, r8);
+ }
+ r8 = _mm256_and_si256(r2, r8);
+ r8 = _mm256_and_si256(r3, r8);
+ r8 = _mm256_and_si256(r4, r8);
+ r8 = _mm256_and_si256(r5, r8);
+ r8 = _mm256_and_si256(r6, r8);
+ r8 = _mm256_and_si256(r7, r8);
+
+ if (!_mm256_testz_si256(r8, r8)) {
+ if (last) {
+ _mm256_store_si256((__m256i *)(map + i), r8);
+ return i;
+ }
+ ret = 0;
+ }
+ _mm256_store_si256((__m256i *)(map + i), r8);
+ }
+
+ return ret;
+}
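
As a concrete illustration of the group-to-bucket mapping above (the address is
chosen arbitrarily): matching IPv4 address 192.0.2.1 means pkt = { 192, 0, 2, 1 },
so the eight loads select the following buckets:

    group 0: 192 >> 4   = 12        group 1: 192 & 0x0f = 0
    group 2:   0 >> 4   =  0        group 3:   0 & 0x0f = 0
    group 4:   2 >> 4   =  0        group 5:   2 & 0x0f = 2
    group 6:   1 >> 4   =  0        group 7:   1 & 0x0f = 1

A rule covering 192.0.2.1 in this field matches only if its bit is set in all
eight of those buckets.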
+
+/**
+ * avx2_lookup12() - AVX2-based lookup for packet fields of 12 four-bit groups
+ * @map: Previous matching bitmap that will be updated
+ * @lt: Lookup table for this field
+ * @pkt: Packet bytes to be matched
+ * @bsize: Bucket size for this lookup table, in bytes
+ * @first: If this is the first field in a set, start with an all-ones bitmap
+ * @last: Last field: stop on the first match and return its position
+ *
+ * Load buckets from the lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap); otherwise, also AND them
+ * with the provided starting bitmap. Then store the resulting match bitmap in
+ * @map.
+ *
+ * This implements steps 4.1 to 4.3 of the algorithm description, and is used
+ * for 48-bit fields (e.g. MAC addresses).
+ *
+ * Return: 32-byte rounded position of the match if this is the last field;
+ * otherwise 0 on match and -1 on no match.
+ */
+__always_inline int avx2_lookup12(uint8_t *map, uint8_t *lt, uint8_t *pkt,
+ int bsize, int first, int last)
+{
+ __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12;
+ int i, ret = -1;
+
+ for (i = 0; i < bsize; i += 32) {
+ r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
+ r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
+ r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
+ r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
+ r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);
+ r8 = AVX_LOAD(lt + i, 8, pkt[4] >> 4, bsize);
+ r9 = AVX_LOAD(lt + i, 9, pkt[4] & 0x0f, bsize);
+ r10 = AVX_LOAD(lt + i, 10, pkt[5] >> 4, bsize);
+ r11 = AVX_LOAD(lt + i, 11, pkt[5] & 0x0f, bsize);
+
+ if (first) {
+ r12 = _mm256_and_si256(r0, r1);
+ } else {
+ r12 = _mm256_stream_load_si256((__m256i *)(map + i));
+ r12 = _mm256_and_si256(r0, r12);
+ r12 = _mm256_and_si256(r1, r12);
+ }
+
+ r12 = _mm256_and_si256(r2, r12);
+ r12 = _mm256_and_si256(r3, r12);
+ r12 = _mm256_and_si256(r4, r12);
+ r12 = _mm256_and_si256(r5, r12);
+ r12 = _mm256_and_si256(r6, r12);
+ r12 = _mm256_and_si256(r7, r12);
+ r12 = _mm256_and_si256(r8, r12);
+ r12 = _mm256_and_si256(r9, r12);
+ r12 = _mm256_and_si256(r10, r12);
+ r12 = _mm256_and_si256(r11, r12);
+
+ if (!_mm256_testz_si256(r12, r12)) {
+ if (last) {
+ _mm256_store_si256((__m256i *)(map + i), r12);
+ return i;
+ }
+ ret = 0;
+ }
+ _mm256_store_si256((__m256i *)(map + i), r12);
+ }
+
+ return ret;
+}
+
+/**
+ * avx2_lookup32() - AVX2-based lookup for packet fields of 32 four-bit groups
+ * @map: Previous matching bitmap that will be updated
+ * @lt: Lookup table for this field
+ * @pkt: Packet bytes to be matched
+ * @bsize: Bucket size for this lookup table, in bytes
+ * @first: If this is the first field in a set, start with an all-ones bitmap
+ * @last: Last field: stop on the first match and return its position
+ *
+ * Load buckets from the lookup table corresponding to the values of each 4-bit
+ * group of packet bytes, and perform a bitwise intersection between them. If
+ * this is the first field in the set, simply AND the buckets together
+ * (equivalent to using an all-ones starting bitmap); otherwise, also AND them
+ * with the provided starting bitmap. Then store the resulting match bitmap in
+ * @map.
+ *
+ * This implements steps 4.1 to 4.3 of the algorithm description, and is used
+ * for 128-bit fields (e.g. IPv6 addresses).
+ *
+ * Return: 32-byte rounded position of the match if this is the last field;
+ * otherwise 0 on match and -1 on no match.
+ */
+__always_inline int avx2_lookup32(uint8_t *map, uint8_t *lt, uint8_t *pkt,
+ int bsize, int first, int last)
+{
+ __m256i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r15;
+ int i, ret = -1;
+
+ for (i = 0; i < bsize; i += 32) {
+ r0 = AVX_LOAD(lt + i, 0, pkt[0] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 1, pkt[0] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 2, pkt[1] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 3, pkt[1] & 0x0f, bsize);
+ r4 = AVX_LOAD(lt + i, 4, pkt[2] >> 4, bsize);
+ r5 = AVX_LOAD(lt + i, 5, pkt[2] & 0x0f, bsize);
+ r6 = AVX_LOAD(lt + i, 6, pkt[3] >> 4, bsize);
+ r7 = AVX_LOAD(lt + i, 7, pkt[3] & 0x0f, bsize);
+ r8 = AVX_LOAD(lt + i, 8, pkt[4] >> 4, bsize);
+ r9 = AVX_LOAD(lt + i, 9, pkt[4] & 0x0f, bsize);
+ r10 = AVX_LOAD(lt + i, 10, pkt[5] >> 4, bsize);
+ r11 = AVX_LOAD(lt + i, 11, pkt[5] & 0x0f, bsize);
+ r12 = AVX_LOAD(lt + i, 12, pkt[6] >> 4, bsize);
+ r13 = AVX_LOAD(lt + i, 13, pkt[6] & 0x0f, bsize);
+
+ if (first) {
+ r15 = _mm256_and_si256(r0, r1);
+ } else {
+ r15 = _mm256_stream_load_si256((__m256i *)(map + i));
+ r15 = _mm256_and_si256(r0, r15);
+ r15 = _mm256_and_si256(r1, r15);
+ }
+
+ r15 = _mm256_and_si256(r2, r15);
+ r15 = _mm256_and_si256(r3, r15);
+ r15 = _mm256_and_si256(r4, r15);
+ r15 = _mm256_and_si256(r5, r15);
+ r15 = _mm256_and_si256(r6, r15);
+ r15 = _mm256_and_si256(r7, r15);
+ r15 = _mm256_and_si256(r8, r15);
+ r15 = _mm256_and_si256(r9, r15);
+ r15 = _mm256_and_si256(r10, r15);
+ r15 = _mm256_and_si256(r11, r15);
+ r15 = _mm256_and_si256(r12, r15);
+ r15 = _mm256_and_si256(r13, r15);
+
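+ /* Registers r0..r13 are free again: reuse them for groups 14 to 27 */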
+ r0 = AVX_LOAD(lt + i, 14, pkt[7] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 15, pkt[7] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 16, pkt[8] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 17, pkt[8] & 0x0f, bsize);
+ r4 = AVX_LOAD(lt + i, 18, pkt[9] >> 4, bsize);
+ r5 = AVX_LOAD(lt + i, 19, pkt[9] & 0x0f, bsize);
+ r6 = AVX_LOAD(lt + i, 20, pkt[10] >> 4, bsize);
+ r7 = AVX_LOAD(lt + i, 21, pkt[10] & 0x0f, bsize);
+ r8 = AVX_LOAD(lt + i, 22, pkt[11] >> 4, bsize);
+ r9 = AVX_LOAD(lt + i, 23, pkt[11] & 0x0f, bsize);
+ r10 = AVX_LOAD(lt + i, 24, pkt[12] >> 4, bsize);
+ r11 = AVX_LOAD(lt + i, 25, pkt[12] & 0x0f, bsize);
+ r12 = AVX_LOAD(lt + i, 26, pkt[13] >> 4, bsize);
+ r13 = AVX_LOAD(lt + i, 27, pkt[13] & 0x0f, bsize);
+
+ r15 = _mm256_and_si256(r0, r15);
+ r15 = _mm256_and_si256(r1, r15);
+ r15 = _mm256_and_si256(r2, r15);
+ r15 = _mm256_and_si256(r3, r15);
+ r15 = _mm256_and_si256(r4, r15);
+ r15 = _mm256_and_si256(r5, r15);
+ r15 = _mm256_and_si256(r6, r15);
+ r15 = _mm256_and_si256(r7, r15);
+ r15 = _mm256_and_si256(r8, r15);
+ r15 = _mm256_and_si256(r9, r15);
+ r15 = _mm256_and_si256(r10, r15);
+ r15 = _mm256_and_si256(r11, r15);
+ r15 = _mm256_and_si256(r12, r15);
+ r15 = _mm256_and_si256(r13, r15);
+
+ r0 = AVX_LOAD(lt + i, 28, pkt[14] >> 4, bsize);
+ r1 = AVX_LOAD(lt + i, 29, pkt[14] & 0x0f, bsize);
+ r2 = AVX_LOAD(lt + i, 30, pkt[15] >> 4, bsize);
+ r3 = AVX_LOAD(lt + i, 31, pkt[15] & 0x0f, bsize);
+
+ r15 = _mm256_and_si256(r0, r15);
+ r15 = _mm256_and_si256(r1, r15);
+ r15 = _mm256_and_si256(r2, r15);
+ r15 = _mm256_and_si256(r3, r15);
+
+ if (!_mm256_testz_si256(r15, r15)) {
+ if (last) {
+ _mm256_store_si256((__m256i *)(map + i), r15);
+ return i;
+ }
+ ret = 0;
+ }
+ _mm256_store_si256((__m256i *)(map + i), r15);
+ }
+
+ return ret;
+}
+
+/**
+ * avx2_lookup() - AVX2-based lookup for packet fields
+ * @map: Previous matching bitmap that will be updated
+ * @lt: Lookup table for this field
+ * @pkt: Packet bytes to be matched
+ * @groups: Number of 4-bit groups in this field
+ * @bsize: Bucket size for this lookup table, in bytes
+ * @first: If this is the first field in a set, start with an all-ones bitmap
+ * @last: If this is the last field in a set, stop on the first match
+ *
+ * This implements steps 4.1 to 4.3 of the algorithm description.
+ *
+ * Return: 32-byte rounded position of the match if this is the last field;
+ * otherwise 0 on match and -1 on no match.
+ */
+__always_inline int avx2_lookup(uint8_t *map, uint8_t *lt, uint8_t *pkt,
+ int groups, int bsize, int first, int last)
+{
+ if (groups == 4)
+ return avx2_lookup4(map, lt, pkt, bsize, first, last);
+ if (groups == 8)
+ return avx2_lookup8(map, lt, pkt, bsize, first, last);
+ if (groups == 12)
+ return avx2_lookup12(map, lt, pkt, bsize, first, last);
+ if (groups == 32)
+ return avx2_lookup32(map, lt, pkt, bsize, first, last);
+
+ return 0;
+}
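
For context, here is a sketch of how the dispatcher above could be driven for a
set matching on two fields, e.g. a 16-bit port followed by a 32-bit IPv4
address. It is illustrative only and not part of this patch: the function name
and the way the tables, bucket sizes and result bitmap are obtained are
assumptions, and the translation of the match bitmap between fields (the steps
of the algorithm after 4.3) is deliberately left out.

static int example_lookup(uint8_t *res, uint8_t *lt_port, int bsize_port,
                          uint8_t *lt_addr, int bsize_addr,
                          uint8_t *port, uint8_t *addr)
{
    /* First field: 4 four-bit groups, implicit all-ones starting bitmap */
    if (avx2_lookup(res, lt_port, port, 4, bsize_port, 1, 0) < 0)
        return -1;

    /* Last field: 8 four-bit groups, stop at the first matching 32-byte
     * chunk and return its position, or -1 if nothing matched.
     */
    return avx2_lookup(res, lt_addr, addr, 8, bsize_addr, 0, 1);
}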