summaryrefslogtreecommitdiff
path: root/pipapo.c
diff options
context:
space:
mode:
authorStefano Brivio <sbrivio@redhat.com>2019-11-19 00:18:17 (GMT)
committerStefano Brivio <sbrivio@redhat.com>2019-11-19 00:18:17 (GMT)
commita724e8dbd67ce3d9bf5a24bd836dea4ad3a5516f (patch)
tree8575f185b5f2e773a7334ffe1dd5891a70bb2151 /pipapo.c
pipapo: Initial importHEADmaster
Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
Diffstat (limited to 'pipapo.c')
-rw-r--r--pipapo.c1093
1 files changed, 1093 insertions, 0 deletions
diff --git a/pipapo.c b/pipapo.c
new file mode 100644
index 0000000..d44cb72
--- /dev/null
+++ b/pipapo.c
@@ -0,0 +1,1093 @@
+/* PIPAPO - PIle PAcket POlicies
+ *
+ * pipapo.c - Stand-alone algorithm implementation
+ *
+ * Author: Stefano Brivio <sbrivio@redhat.com>
+ * License: GPLv2
+ *
+ *
+ * == 1. Problem
+ *
+ * Match packets against n generic entries with m (ipset-like) fields in a set,
+ * mapping them to n keys, for example:
+ *
+ * --- fields --->
+ * | [net],[port],[net]... => [key]
+ * entries [net],[port],[net]... => [key]
+ * | [net],[port],[net]... => [key]
+ * V ...
+ *
+ * where [net] fields can be IP ranges or netmasks, and [port] fields are port
+ * ranges. Arbitrary packet fields (source and destination IPv4, IPv6, MAC
+ * addresses and ports) can be matched.
+ *
+ *
+ * == 2. Introduction
+ *
+ * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally
+ * relies on the consideration that every contiguous range in a space of b bits
+ * can be converted into b netmasks, from Theorem 3 in [Rottenstreich 2010].
+ * Despite the fact that we don't implement a boolean classifier here, some
+ * relevant optimisation ideas were inspired by [Kogan 2014], especially related
+ * to the notion of order-independent regions of a classifier.
+ *
+ * Classification against a number of entries, that require matching given bits
+ * of a packet field, is performed by grouping those bits in sets of arbitrary
+ * size, and classifying packet bits one group at a time.
+ *
+ * Example: to match the source port (16 bits) of a packet, we can divide
+ * those 16 bits in 4 groups of 4 bits each. Given the entry:
+ * 0000 0001 0101 1001
+ * and a packet with source port:
+ * 0000 0001 1010 1001
+ * first and second groups match, but the third doesn't. We conclude that the
+ * packet doesn't match the given entry.
+ *
+ * Translate the set to a sequence of lookup tables, one per field. Each table
+ * has two dimensions: bit groups to be matched for a single packet field, and
+ * all the possible values of said groups. Each group in the table maps to the
+ * next group through a set of rules, which are generated from entries during
+ * the pre-computation phase. The last group in a table maps to the next table,
+ * and the set of matched rules after the last table in the set is mapped to a
+ * single verdict key.
+ *
+ * To match, we perform table lookups using the values of relevant packet bits,
+ * divided according to the groups, and use a sequence of bitwise operations to
+ * progressively evaluate rule matching.
+ *
+ * In the description below, the basic idea is described first, and
+ * optimisations are illustrated later. Steps with O(n) computational complexity
+ * are then all reduced to O(log(n)) problems.
+ *
+ *
+ * == 3. Precomputation
+ *
+ * - For each packet field:
+ *
+ * (3.1) divide the b packet bits we want to classify into groups of size t,
+ * obtaining ceil(b / t) groups
+ *
+ * Example: match on destination IP address, with t = 4: 32 bits, 8 groups
+ * of 4 bits each
+ *
+ * (3.2) allocate a lookup table with one column ("bucket") for each possible
+ * value of a group, and with one row for each group
+ *
+ * Example: 8 groups, 2^4 buckets:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0
+ * 1
+ * 2
+ * 3
+ * 4
+ * 5
+ * 6
+ * 7
+ *
+ * (3.3) map the bits we want to classify for the current field, for a given
+ * entry, to a single rule for non-ranged and netmask set items, and to one or
+ * multiple rules for ranges. Ranges are expanded to composing netmasks:
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
+ * - rule #0: 10.0.0.5
+ * - rule #1: 192.168.1.0/24
+ * - rule #2: 192.168.2.0/31
+ *
+ * (3.4) insert references to the rules in the lookup table, selecting buckets
+ * according to bit values of a rule in the given group
+ *
+ * Example: given:
+ * - rule #0: 10.0.0.5 mapping to buckets
+ * < 0 10 0 0 0 0 0 5 >
+ * - rule #1: 192.168.1.0/24 mapping to buckets
+ * < 12 0 10 8 0 1 < 0..15 > < 0..15 > >
+ * - rule #2: 192.168.2.0/31 mapping to buckets
+ * < 12 0 10 8 0 2 0 < 0..1 > >
+ *
+ * these bits are set in the lookup table:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * (3.5) if this is not the last field in the set, build a mapping table that
+ * maps rules from the lookup table to rules belonging to the same entry in
+ * the next lookup table
+ *
+ * Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
+ *
+ * given lookup table #0 (see example above):
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * and lookup table #1 with:
+ * - rule #0: 1024 mapping to buckets
+ * < 0 0 4 0 >
+ * - rule #1: 2048 mapping to buckets
+ * < 0 0 5 0 >
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
+ * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
+ * (rules #1, #2) to 2048 in lookup table #2 (rule #1):
+ *
+ * rules
+ * 0 1 2
+ * map to: 0 1 1
+ *
+ * (3.6) if this is the last field in the set, build a mapping table that maps
+ * rules from the last lookup table to verdict keys
+ *
+ * Example: 10.0.0.5:1024 gives 66 as verdict and
+ * 192.168.1.0-192.168.2.1:2048 gives 42. From the rules of lookup table #2
+ * as mapped above:
+ *
+ * rules
+ * 0 1
+ * map to: 66 42
+ *
+ *
+ * == 4. Matching
+ *
+ * (4.1) We use a result bitmap, with the size of a single lookup table bucket,
+ * to represent the matching state that applies at every algorithm step.
+ *
+ * - For each packet field:
+ *
+ * (4.2) start with an all-ones result bitmap
+ *
+ * (4.3) perform a lookup into the table corresponding to the current field,
+ * for each group, and at every group, AND the current result bitmap with the
+ * value from the lookup table bucket
+ *
+ * Example: 192.168.1.5 < 12 0 10 8 0 1 0 5 >, with lookup table from
+ * pre-computation example.
+ * Lookup table buckets are at least 3 bits wide, we'll arbitrarily assume
+ * they are 8 bits for convenience. Initial result bitmap is 0xff, the
+ * steps below show the value of the result bitmap after each group is
+ * processed:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ *
+ * result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6
+ *
+ * 1 1,2 0
+ *
+ * result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6
+ *
+ * 2 0 1,2
+ *
+ * result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6
+ *
+ * 3 0 1,2
+ *
+ * result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6
+ *
+ * 4 0,1,2
+ *
+ * result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6
+ *
+ * 5 0 1 2
+ *
+ * result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2
+ *
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ *
+ * result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2
+ *
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2
+ *
+ * (4.4) at the next field, start with a new, all-zeroes result bitmap. For
+ * each bit set in the previous result bitmap, OR the new result bitmap with
+ * the corresponding value from the mapping table for this field
+ *
+ * Example: with mapping table from pre-computation example, current result
+ * bitmap 0x02:
+ *
+ * rules
+ * 0 1 2
+ * map to: 0 1 1
+ *
+ * new result bitmap is 0x00 | 0x02 [bit/rule 1 was set] = 0x02
+ *
+ * we can now extend this example to cover the second iteration of the step
+ * above (lookup and AND bitmap): assuming the port field is
+ * 2048 < 0 0 5 0 >, with starting result bitmap 0x2, and lookup table
+ * for "port" field from pre-computation example:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5]
+ * & 0x3 [bucket 0], resulting bitmap is 0x2.
+ *
+ * (4.5) if this is the last field in the set, look up the value from the
+ * verdict table corresponding to the final result bitmap
+ *
+ * Example: 0x2 resulting bitmap from 192.168.1.5:2048, verdict table from
+ * pre-computation example:
+ *
+ * rules
+ * 0 1
+ * map to: 66 42
+ *
+ * the verdict is 42.
+ *
+ *
+ * == 5. Ideas for Future Optimisations
+ *
+ * (5.1) Reduction of number of rules by allowing inverted netmasks
+ *
+ * CAVEAT: This needs some complicated tracking for inverted entries, which
+ * aren't implemented here at all. In order to further halve the number of
+ * rules as described below, we also need to use sets of netmasks, instead of a
+ * single netmask for each entry, and intersect between them and their
+ * inversions. This also introduces further complexity.
+ *
+ * Theorem 3 in [Rottenstreich 2010] shows that every range of b bits can be
+ * converted into b bitmaps, with inversions, as also illustrated by section 9
+ * in [Kogan 2014].
+ *
+ * Further, if we allow partial subtractions between netmasks, for each range R
+ * of b bits, that can be converted into a b0 number of bitmaps, there exists a
+ * complementary range, R1, that can be converted into a b1 number of bitmaps,
+ * such that b1 + b0 = b.
+ *
+ * [TODO: Formal proof. This consideration is intuitively true and was validated
+ * by bruteforcing over the IPv4 address space, so we can safely use it in the
+ * implementation for the moment.]
+ *
+ * By construction, this allows us to express any range of b bits as ceil(b / 2)
+ * entries, either as reversed or direct entries. By allowing reversed entries,
+ * we need to also provide a reverse mapping possibility, which is achieved by
+ * modifying the mapping table from pre-computation step 4.5, and the step
+ * itself, as follows:
+ * - every bit corresponding to a reverse rule is set in the initial result
+ * bitmap, which is not necessarily an all-zeroes bitmap anymore
+ * - the transformation step becomes a XOR operation: the XOR step with the
+ * modified initial bitmap reverses the result, if we guarantee that one
+ * single XOR step is performed. This comes, in turn, from optimisation 5.2
+ *
+ * Example: match on 0.0.0.1-127.255.255.255:1024 and on 192.168.1.1:1024.
+ * A direct mapping means we need 31 rules for the first address range, using
+ * reversed rules we only need 2 rules instead (rule #0: 0.0.0.0/32,
+ * rule #1: 128.0.0.0/1). Rule #2: 192.168.1.1/32.
+ * First lookup table:
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1 1 1 1 1,2 1 1 1
+ * 1 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 2 0,1 1 1 1 1 1 1 1 1 1 1,2 1 1 1 1 1
+ * 3 0,1 1 1 1 1 1 1 1 1,2 1 1 1 1 1 1 1
+ * 4 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 5 0,1 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 0,1 1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ *
+ * Second lookup table: rule #0: 1024 for first entry, rule #1: 1024 for
+ * second entry
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0,1
+ * 3 0,1
+ *
+ * mapping table: initial bitmap is 0x1 (entry with rule #0 in second table
+ * is reversed in first table: set bit #0)
+ * rules
+ * 0 1 2
+ * XOR with: 0x1 0x1 0x2
+ *
+ * - expansion to reverse rules is selected if the number of reversed rules is
+ * strictly less than the number of needed direct rules.
+ *
+ * (5.2) Single step for match and fill from result bitmap
+ *
+ * CAVEAT: This significantly affects complexity of listing and deletion,
+ * because once rules are replaced, further steps are needed to rebuild the
+ * originating entries.
+ *
+ * Step 4.4 loops over all bits set in the current result bitmap: this is
+ * needed as set entries might overlap, for one or more fields. The
+ * computational complexity of mapping steps is O(m * log(n)), with m > 1
+ * overlapping factor, because we have no tight bounds on the number of
+ * overlapping entries. However, we can apply a reduction step on lookup tables,
+ * and ensure a single bit is set in the result bitmap after each lookup table
+ * is considered, with three observations:
+ *
+ * (5.2.1) In case two entries overlap entirely in a given lookup table, and
+ * their rules are present in the previous mapping table (or if it's the first
+ * lookup table), the entries can map to the same set of rules without loss of
+ * generality.
+ *
+ * Example: given entries 192.168.1.0/24:1024 and 192.168.1.0/24:2048, first
+ * lookup table initially has rules:
+ * #0: 192.168.1.0/24
+ * #1: 192.168.1.0/24
+ * and second lookup table has rules:
+ * #0: 1024
+ * #1: 2048
+ * with mapping table #0 -> #0 | #1 -> #1.
+ * After this reduction step, the first lookup table has a single
+ * rule (#0: 192.168.1.0/24), mapping to both rule #0 and #1.
+ *
+ * Example: given entries 192.168.1.0/24:1024:10.0.0.1 and
+ * 192.168.2.0/24:1024:10.0.0.1, we don't apply any reduction step.
+ * First lookup table has rules:
+ * #0: 192.168.1.0/24
+ * #1: 192.168.2.0/24
+ * and second lookup table has rules:
+ * #0: 1024
+ * #1: 1024
+ * with first mapping table #0 -> #0 | #1 -> #1. As there are no
+ * images in the first lookup table with both rules #0 and #1 set,
+ * these two rules will be preserved. Third lookup table has two
+ * rules:
+ * #0: 10.0.0.1/32
+ * #1: 10.0.0.1/32
+ * and second mapping table is: #0 -> #0 | #1 -> #1.
+ *
+ * (5.2.2) In case two entries overlap partially in a given lookup table, due to
+ * the way we expand ranges to netmasks, one set of rules is always a proper
+ * subset of the other overlapping one. That is, we can hit this situation:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 x,y
+ * 1 x,y x,y y y
+ *
+ * but not:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 x,y
+ * 1 x x,y y y
+ *
+ * so in case entries corresponding to x map to a proper subset of the y rules,
+ * x rules never occur without y rules. We can then replace the x,y pairs by
+ * x, and merge their mapping tables.
+ *
+ * Example: entries: 192.168.1.1-192.168.1.2:1
+ * expands to: 192.168.1.1/32:1 (#0:#0), 192.168.1.2/32:1 (#1:#0)
+ * 192.168.1.0-192.168.1.1:2
+ * expands to: 192.168.1.0/31:2 (#2:#1)
+ * 192.168.1.0-192.168.1.2:3
+ * expands to: 192.168.1.0/31:3 (#3:#2), 192.168.1.2/32:3 (#4:#2)
+ *
+ * First lookup table (only last group shown):
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * ...
+ * 7 2,3 0,2,3 1,4
+ *
+ * Second lookup table (only last group shown):
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * ...
+ * 3 0 1 2
+ *
+ * Initial mapping table: #0 -> #0 | #1 -> #0 | #2 -> #1 | #3 -> #2 | #4 -> #2
+ *
+ * Apply reduction step from 5.2.1 twice to first lookup table: rule #1
+ * entirely overlaps with #4, #2 entirely overlaps with #3. New lookup table:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * ...
+ * 7 2 0,2 1
+ *
+ * New mapping table: #0 -> #0 | #1 -> #0, #2 | #2 -> #1, #2
+ *
+ * Apply reduction step from 5.2.2: 0 never occurs without 2:
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * ...
+ * 7 2 0 1
+ *
+ * New mapping table: #0 -> #0, #1, #2 | #1 -> #0, #2 | #2 -> #1, #2
+ *
+ * (5.2.3) There are no other ways entries can overlap (directly from set
+ * theory)
+ * [TODO: Formal proof or reference.]
+ *
+ * (5.3) Single step for invert, match and bitmap fill operations (comes almost
+ * for free with 5.2 and 5.1).
+ *
+ * We can now replace the loop from 4.4 with a single XOR operation
+ * depending on the single bit set in the result bitmask after a lookup table is
+ * evaluated. Computational complexity of that step becomes, in theory, O(1):
+ * a single bit is set, hence lookup of the corresponding bitmap can be done in
+ * constant time. For all practical implementations, this is O(log(n)): the
+ * number of words in the mapping table grows linearly with the number of
+ * entries, but number of rules grows sublinearly and we can find the set bit
+ * with very efficient operations, too (e.g. x86's TZCNT, AVX-512 VPLZCNTQ,
+ * PPC's CNTLZ, etc.)]
+ *
+ * (5.4) [TODO: Evaluate if this provides an actual advantage even with an
+ * implementation of step 4.3 optionally based on AVX2 VPAND instrinsics.]
+ * As an alternative to finding the first bit set in the result bitmap for the
+ * mapping table lookup, we can terminate step 4.3 early, once we reach the last
+ * group of a table and a set bit is found in the result, and use the position
+ * of this bit directly for the mapping table lookup.
+ *
+ * (5.5) [TODO: Evaluate how bad this is on architectures where branching is
+ * expensive, e.g. PPC, find out if it makes sense to implement this optionally
+ * depending on architecture.]
+ * - We can terminate the algorithm early if the initial result bitmap, as we
+ * start processing the next lookup table, is empty: that means that the
+ * packet won't match any entry
+ * - We can terminate step 4.3 early if the result bitmap happens to be empty
+ * after a group is entirely evaluated: this means the result bitmap will be
+ * also empty at the beginning of step 4.4. We can't terminate the whole
+ * algorithm in this case, though, as entries might be represented with an
+ * inverted set of rules.
+ *
+ * (5.6) The mapping table described in step (3.5) grows in size linearly with
+ * the product of rules for the current field and rules for the next field. For
+ * entries with non-overlapping rules in a given field, this size would grow
+ * with the number of rules for this field, independently of the fact that these
+ * rules might map to the same set of rules in the next field.
+ *
+ * Example: one entry, 192.168.1.0-192.168.2.1:2048
+ *
+ * given lookup table #0 (see example above):
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0 1,2
+ * 1 1,2 0
+ * 2 0 1,2
+ * 3 0 1,2
+ * 4 0,1,2
+ * 5 0 1 2
+ * 6 0,1,2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ * 7 1,2 1,2 1 1 1 0,1 1 1 1 1 1 1 1 1 1 1
+ *
+ * and lookup table #1 with:
+ * - rule #0: 1024 mapping to buckets
+ * < 0 0 4 0 >
+ * - rule #1: 2048 mapping to buckets
+ * < 0 0 5 0 >
+ *
+ * bucket
+ * group 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ * 0 0,1
+ * 1 0,1
+ * 2 0 1
+ * 3 0,1
+ *
+ * we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
+ * in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
+ * (rules #1, #2) to 2048 in lookup table #2 (rule #1):
+ *
+ * rules
+ * 0 1 2
+ * map to: 0 1 1
+ *
+ *
+ * == 6. References
+ *
+ * [Ligatti 2010]
+ * A Packet-classification Algorithm for Arbitrary Bitmask Rules, with
+ * Automatic Time-space Tradeoffs
+ * Jay Ligatti, Josh Kuhn, and Chris Gage.
+ * Proceedings of the IEEE International Conference on Computer
+ * Communication Networks (ICCCN), August 2010.
+ * http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
+ *
+ * [Rottenstreich 2010]
+ * Worst-Case TCAM Rule Expansion
+ * Ori Rottenstreich and Isaac Keslassy.
+ * 2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010.
+ * http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf
+ *
+ * [Kogan 2014]
+ * SAX-PAC (Scalable And eXpressive PAcket Classification)
+ * Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
+ * and Patrick Eugster.
+ * Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
+ * http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
+ */
+
+#ifndef VERBOSE
+#define PERF_TEST_BLOCK 100
+#endif
+
+#include <stdio.h>
+#include <arpa/inet.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <time.h>
+
+#include "pipapo.h"
+#include "set.h"
+#include "match.h"
+#include "util.h"
+
+/* Specifiers for set description: label, type, length, and offset in packet */
+struct desc_spec desc_specs[] = {
+ { "key", KEY, 0, 0 },
+ { "dmac", MAC, MAC_LEN, 0 },
+ { "smac", MAC, MAC_LEN, 6 },
+ { "saddr", ADDR, ADDR_LEN, 26 },
+ { "daddr", ADDR, ADDR_LEN, 30 },
+ { "sport", PORT, PORT_LEN, 34 },
+ { "dport", PORT, PORT_LEN, 36 },
+ { "saddr6", ADDR6, ADDR6_LEN, 22 },
+ { "daddr6", ADDR6, ADDR6_LEN, 38 },
+ { "sport6", PORT, PORT_LEN, 54 },
+ { "dport6", PORT, PORT_LEN, 56 },
+ { NULL, 0, 0, 0 },
+};
+
+/**
+ * desc_parse_spec() - Parse set layout from tokens on first line of set file
+ * @d: Pre-allocated set description to fill in
+ * @n: Number of field tokens
+ * @tokens: Labels of fields in desired set layout
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int desc_parse_spec(struct desc *d, int n, char (*tokens)[BUFSIZ])
+{
+ struct desc_spec *spec;
+ int i;
+
+ d->row_size = sizeof(enum set_ops);
+
+ for (i = 0; i < n; i++) {
+ for (spec = desc_specs; spec->label; spec++) {
+ if (!strncmp(tokens[i], spec->label, BUFSIZ))
+ break;
+ }
+
+ if (!spec->label) {
+ fprintf(stderr, "Invalid set specifier token %s\n",
+ tokens[i]);
+ return -EINVAL;
+ }
+
+ d->layout[i] = spec;
+
+ if (spec->type == KEY) {
+ d->row_size += sizeof(uint32_t);
+ continue;
+ }
+
+ d->fields++;
+ if (spec->type == ADDR)
+ d->row_size += sizeof(struct addr);
+ else if (spec->type == ADDR6)
+ d->row_size += sizeof(struct addr6);
+ else if (spec->type == PORT)
+ d->row_size += sizeof(struct port);
+ else if (spec->type == MAC)
+ d->row_size += sizeof(struct mac);
+ }
+
+ return 0;
+}
+
+/**
+ * desc_parse_entry() - Parse set entries from set file
+ * @d: Pre-allocated set description, d->data to be filled
+ * @at: Starting position in d->data to be filled with next set entry
+ * @n: Number of fields
+ * @tokens: Set items: addresses, ports, ranges thereof
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int desc_parse_entry(struct desc *d, int at, int n, char (*tokens)[BUFSIZ])
+{
+ int i, base = at * d->row_size, offset, b;
+ struct addr6 a6;
+ struct addr a;
+ struct port p;
+ char *c, *end;
+ struct mac m;
+ uint32_t key;
+ uint8_t v[6];
+
+ if (!strncmp(tokens[0], "list", strlen(tokens[0]))) {
+ *(enum set_ops *)(d->data + base) = LIST;
+ return 0;
+ }
+
+ if (!strncmp(tokens[0], "add", strlen(tokens[0])))
+ *(enum set_ops *)(d->data + base) = ADD;
+ else if (!strncmp(tokens[0], "delete", strlen(tokens[0])))
+ *(enum set_ops *)(d->data + base) = DEL;
+ else
+ return -EINVAL;
+ offset = sizeof(enum set_ops);
+ tokens++;
+ n--;
+
+ for (i = 0; i < n; i++) {
+ switch (d->layout[i]->type) {
+ case KEY:
+ key = strtoul(tokens[i], &end, 10);
+ if (*end)
+ return -EINVAL;
+
+ memcpy(d->data + base + offset, &key, sizeof(key));
+ offset += sizeof(key);
+ break;
+ case ADDR:
+ /* IPv4... */
+ if ((c = strchr(tokens[i], '-'))) {
+ /* range */
+ if (!inet_aton(c + 1, (struct in_addr *)&a.end))
+ return -EINVAL;
+ a.cidr = 0;
+ *c = 0;
+ } else if ((c = strchr(tokens[i], '/'))) {
+ /* netmask */
+ *c = 0;
+ a.cidr = strtoul(c + 1, &end, 10);
+ if (*end)
+ return -EINVAL;
+ } else {
+ /* address */
+ a.cidr = 32;
+ }
+
+ if (!inet_aton(tokens[i], (struct in_addr *)&a.start))
+ return -EINVAL;
+
+ memcpy(d->data + base + offset, &a, sizeof(a));
+ offset += sizeof(a);
+ break;
+ case ADDR6:
+ /* IPv6... */
+ if ((c = strchr(tokens[i], '-'))) {
+ /* range */
+ if (!inet_pton(AF_INET6, c + 1, &a6.end))
+ return -EINVAL;
+ a6.cidr = 0;
+ *c = 0;
+ } else if ((c = strchr(tokens[i], '/'))) {
+ /* netmask */
+ *c = 0;
+ a6.cidr = strtoul(c + 1, &end, 10);
+ if (*end)
+ return -EINVAL;
+ } else {
+ /* address */
+ a6.cidr = 128;
+ }
+
+ if (!inet_pton(AF_INET6, tokens[i], &a6.start))
+ return -EINVAL;
+
+ memcpy(d->data + base + offset, &a6, sizeof(a6));
+ offset += sizeof(a6);
+ break;
+ case PORT:
+ if ((c = strchr(tokens[i], '-'))) {
+ *c = 0;
+ p.end = htons(strtoul(c + 1, &end, 10));
+ if (*end)
+ return -EINVAL;
+ } else {
+ p.end = 0;
+ }
+
+ p.start = htons(strtoul(tokens[i], &end, 10));
+ if (*end)
+ return -EINVAL;
+
+ memcpy(d->data + base + offset, &p, sizeof(p));
+ offset += sizeof(p);
+ break;
+ case MAC:
+ if ((c = strchr(tokens[i], '-')))
+ *c = 0;
+
+ b = sscanf(tokens[i],
+ "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx%*c",
+ v + 0, v + 1, v + 2, v + 3, v + 4, v + 5);
+ if (b != 6)
+ return -EINVAL;
+ memcpy(&m.start, v, 6);
+
+ if (c) {
+ b = sscanf(c + 1,
+ "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx%*c",
+ v + 0, v + 1, v + 2,
+ v + 3, v + 4, v + 5);
+ if (b != 6)
+ return -EINVAL;
+ memcpy(&m.end, v, 6);
+ } else {
+ memset(&m.end, 0, 6);
+ }
+
+ memcpy(d->data + base + offset, &m, sizeof(m));
+ offset += sizeof(m);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * usage() - Display usage and exit
+ * @argv0: Executable name
+ */
+void usage(const char *argv0)
+{
+ fprintf(stderr, "%s <ruleset> <packets>\n", argv0);
+ exit(1);
+}
+
+/**
+ * packet_parse() - Parse packets and expected key from path into binary format
+ * @path: Path to packet file
+ * @ret: Pointer to packet buffer, incremented by this function
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+static int packet_parse(const char *path, uint8_t **ret)
+{
+ int alloc, n, byte_count, packet_count = 0, err;
+ uint8_t *packets, *pos, *tmp;
+ char buf[BUFSIZ], *end;
+ char *p;
+ FILE *f;
+
+ alloc = BUFSIZ;
+ pos = packets = malloc(alloc * (sizeof(uint32_t) + PACKET_SIZE));
+ if (!packets)
+ return -ENOMEM;
+
+ f = fopen(path, "r");
+ if (!f) {
+ err = -errno;
+ goto fail;
+ }
+
+ while (fgets(buf, BUFSIZ, f)) {
+ if (!*buf || *buf == '#' || *buf == ' ' || *buf == '\n')
+ continue;
+
+ if (++packet_count > alloc) {
+ alloc += BUFSIZ;
+ tmp = realloc(packets,
+ alloc * (sizeof(uint32_t) + PACKET_SIZE));
+ if (!tmp) {
+ err = -ENOMEM;
+ goto fail_close;
+ }
+ packets = tmp;
+ pos = packets;
+ pos += (packet_count - 1) *
+ (sizeof(uint32_t) + PACKET_SIZE);
+ }
+
+ /* Key */
+ p = strtok(buf, " \t");
+ n = strtoul(p, &end, 0);
+ if (*end) {
+ err = -EINVAL;
+ goto fail_close;
+ }
+ *(uint32_t *)pos = n;
+ pos += sizeof(uint32_t);
+
+ /* Packet bytes */
+ byte_count = 0;
+ while ((p = strtok(NULL, " \t\n"))) {
+ n = strtoul(p, &end, 0);
+ if (*end) {
+ err = -EINVAL;
+ goto fail_close;
+ }
+ *pos = n;
+ pos++;
+ if (++byte_count >= PACKET_SIZE)
+ break;
+ }
+ if (byte_count < PACKET_SIZE) {
+ memset(pos, 0, PACKET_SIZE - byte_count);
+ pos += PACKET_SIZE - byte_count;
+ }
+ }
+ fclose(f);
+
+ *ret = packets;
+ verbose("Read %i packets from %s\n", packet_count, path);
+
+ return packet_count;
+
+fail_close:
+ fclose(f);
+fail:
+ free(packets);
+ return err;
+}
+
+/**
+ * desc_parse() - Parse set file from path
+ * @path: Path to set file
+ * @d: Pre-allocated set, with description and data parts
+ *
+ * Return: 0 on success, negative error code on failure
+ */
+int desc_parse(const char *path, struct desc *d)
+{
+ int first = 1, err = 0, alloc = 0, n;
+ char t[9][BUFSIZ];
+ char buf[BUFSIZ];
+ uint8_t *tmp;
+ FILE *f;
+
+ f = fopen(path, "r");
+ if (!f)
+ return -errno;
+
+ while (fgets(buf, BUFSIZ, f)) {
+ n = sscanf(buf, "%s %s %s %s %s %s %s %s %s", t[0],
+ t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8]);
+ if (!n || *t[0] == '#')
+ continue;
+
+ if (first) {
+ /* First line is the set type specifier */
+ err = desc_parse_spec(d, n, t);
+ if (err)
+ goto out;
+
+ first = 0;
+ continue;
+ }
+
+ if (d->entries == alloc) {
+ alloc += BUFSIZ;
+ tmp = realloc(d->data, alloc * d->row_size);
+ if (!tmp) {
+ err = -ENOMEM;
+ goto out;
+ }
+ d->data = tmp;
+ }
+
+ err = desc_parse_entry(d, d->entries++, n, t);
+ }
+
+out:
+ fclose(f);
+ return err;
+}
+
+/**
+ * main() - Entry point: parse set and packets, pre-compute, match
+ * @argc: Argument count
+ * @argv: Path to set description and packet files
+ *
+ * Return: 0 on success, 1 on matching failure, negative error code otherwise
+ */
+int main(int argc, char **argv)
+{
+ int packet_count, offset = 0, i, err;
+ uint8_t *packets, *ptr;
+ struct desc d = { 0 };
+ struct kernel_set *ks;
+ struct set s = { 0 };
+ uint32_t key;
+#ifdef VERBOSE
+ char *total_mem_suffix = "B";
+ int total_mem = 0;
+ struct field *f;
+#else
+ struct timespec tp_start, tp_end;
+ int repeat = 0, b = 0;
+ double time = 0;
+#endif
+ (void)key;
+
+ if (argc != 3)
+ usage(argv[0]);
+
+ verbose("=== Parsing\n");
+
+ if ((packet_count = packet_parse(argv[2], &packets)) < 0) {
+ fprintf(stderr, "Parsing failed, %s\n", strerror(packet_count));
+ exit(packet_count);
+ }
+
+ if ((err = desc_parse(argv[1], &d))) {
+ fprintf(stderr, "Parsing failed, %s\n", strerror(err));
+ return err;
+ }
+ verbose("\n");
+
+ verbose("=== Pre-computation\n");
+
+ if ((err = init(&s, d.layout))) {
+ fprintf(stderr, "Set init failed, %s\n", strerror(err));
+ return err;
+ }
+
+ for (i = 0; i < d.entries; i++) {
+ if (*(enum set_ops *)(d.data + offset) == ADD)
+ err = add(&s, d.layout,
+ d.data + offset + sizeof(enum set_ops));
+ else if (*(enum set_ops *)(d.data + offset) == LIST)
+ list_or_del(&s, d.layout, NULL);
+ else if (*(enum set_ops *)(d.data + offset) == DEL)
+ err = list_or_del(&s, d.layout,
+ d.data + offset +
+ sizeof(enum set_ops));
+
+ if (err)
+ return err;
+ offset += d.row_size;
+ }
+
+ for (i = 0; d.layout[i]->type != KEY; i++) {
+ verbose("\n");
+ verbose("Lookup table for %s:\n", d.layout[i]->label);
+ show_lookup(&s.fields[i]);
+ verbose("Mapping table for %s:\n", d.layout[i]->label);
+ show_mapping(&s.fields[i], d.layout[i + 1]->type == KEY);
+ }
+ verbose("\n");
+
+ verbose("=== Match\n");
+
+ ks = kernel_init(&s, d.layout);
+
+#ifndef VERBOSE
+repeat_block:
+ repeat = PERF_TEST_BLOCK;
+ b++;
+ clock_gettime(CLOCK_MONOTONIC, &tp_start);
+ do {
+#endif
+ ptr = packets;
+ for (i = 0; i < packet_count; i++) {
+ key = match(ks, ptr + sizeof(uint32_t));
+#ifdef VERBOSE
+ verbose("Packet #%i => key: %u\n\n", i, key);
+ if (key != *(uint32_t *)ptr) {
+ verbose("FAIL: expected key: %u\n",
+ *(uint32_t *)ptr);
+ return 1;
+ }
+#endif
+ ptr += PACKET_SIZE + sizeof(uint32_t);
+ }
+#ifndef VERBOSE
+ } while (repeat--);
+ clock_gettime(CLOCK_MONOTONIC, &tp_end);
+ time += (tp_end.tv_sec - tp_start.tv_sec) +
+ (tp_end.tv_nsec - tp_start.tv_nsec) / 1E9;
+ if (time < 5) {
+ goto repeat_block;
+ }
+
+ fprintf(stdout, "Matched %lfM packets in %lfs (%lf Mpps)\n",
+ PERF_TEST_BLOCK * b * packet_count / 1000.0 / 1000,
+ time,
+ PERF_TEST_BLOCK * b * packet_count / 1000.0 / 1000 / time);
+#endif
+
+#ifdef VERBOSE
+ verbose("\n");
+
+ verbose("\n=== Memory Usage\n");
+ for_each_field(f, &s, d.layout) {
+ char *lt_suffix = "B", *mt_suffix = "B", *field_suffix = "B";
+ int lt_size = f->groups * BUCKETS * f->bsize;
+ int mt_size = f->rules * sizeof(*f->mt);
+ int field_size;
+
+ total_mem += field_size = lt_size + mt_size + sizeof(*f);
+
+ if (lt_size > 20 * 1024) {
+ lt_size /= 1024;
+ lt_suffix = "KiB";
+ }
+
+ if (mt_size > 20 * 1024) {
+ mt_size /= 1024;
+ mt_suffix = "KiB";
+ }
+
+ if (field_size > 20 * 1024) {
+ field_size /= 1024;
+ field_suffix = "KiB";
+ }
+
+ verbose("Field %i, %s, %i rules:\n", i, d.layout[i]->label,
+ f->rules);
+ verbose(" lookup table: %i%s\n", lt_size, lt_suffix);
+ free(f->lt);
+ verbose(" mapping table: %i%s\n", mt_size, mt_suffix);
+ free(f->mt);
+ verbose(" total: %i%s\n", field_size, field_suffix);
+ verbose("\n");
+ }
+
+ if (total_mem > 50 * 1024 * 1024) {
+ total_mem /= 1024 * 1024;
+ total_mem_suffix = "MiB";
+ } else if (total_mem > 20 * 1024) {
+ total_mem /= 1024;
+ total_mem_suffix = "KiB";
+ }
+ verbose("Total: %i%s\n", total_mem, total_mem_suffix);
+#endif
+
+ /* Rest of allocated data is volatile */
+ free(s.fields);
+ free(d.data);
+ free(packets);
+ free(ks->map[0]);
+ free(ks->map[1]);
+ for (i = 0; i < 8; i++) {
+ free(ks->lt[i]);
+ free(ks->mt[i]);
+ }
+ free(ks);
+
+ return 0;
+}