summaryrefslogtreecommitdiff
path: root/pipapo.c
blob: d44cb725c2a73593eba3fd22b634748e93e71af1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
/* PIPAPO - PIle PAcket POlicies
 *
 * pipapo.c - Stand-alone algorithm implementation
 *
 * Author: Stefano Brivio <sbrivio@redhat.com>
 * License: GPLv2
 *
 *
 * == 1. Problem
 *
 * Match packets against n generic entries with m (ipset-like) fields in a set,
 * mapping them to n keys, for example:
 *
 *             --- fields --->
 *   |     [net],[port],[net]... => [key]
 * entries [net],[port],[net]... => [key]
 *   |     [net],[port],[net]... => [key]
 *   V     ...
 *
 * where [net] fields can be IP ranges or netmasks, and [port] fields are port
 * ranges. Arbitrary packet fields (source and destination IPv4, IPv6, MAC
 * addresses and ports) can be matched.
 *
 *
 * == 2. Introduction
 *
 * This algorithm is loosely inspired by [Ligatti 2010], and fundamentally
 * relies on the consideration that every contiguous range in a space of b bits
 * can be converted into b netmasks, from Theorem 3 in [Rottenstreich 2010].
 * Despite the fact that we don't implement a boolean classifier here, some
 * relevant optimisation ideas were inspired by [Kogan 2014], especially related
 * to the notion of order-independent regions of a classifier.
 *
 * Classification against a number of entries, that require matching given bits
 * of a packet field, is performed by grouping those bits in sets of arbitrary
 * size, and classifying packet bits one group at a time.
 *
 *   Example: to match the source port (16 bits) of a packet, we can divide
 *   those 16 bits in 4 groups of 4 bits each. Given the entry:
 *      0000 0001 0101 1001
 *   and a packet with source port:
 *      0000 0001 1010 1001
 *   first and second groups match, but the third doesn't. We conclude that the
 *   packet doesn't match the given entry.
 *
 * Translate the set to a sequence of lookup tables, one per field. Each table
 * has two dimensions: bit groups to be matched for a single packet field, and
 * all the possible values of said groups. Each group in the table maps to the
 * next group through a set of rules, which are generated from entries during
 * the pre-computation phase. The last group in a table maps to the next table,
 * and the set of matched rules after the last table in the set is mapped to a
 * single verdict key.
 *
 * To match, we perform table lookups using the values of relevant packet bits,
 * divided according to the groups, and use a sequence of bitwise operations to
 * progressively evaluate rule matching.
 *
 * In the description below, the basic idea is described first, and
 * optimisations are illustrated later. Steps with O(n) computational complexity
 * are then all reduced to O(log(n)) problems.
 *
 *
 * == 3. Precomputation
 *
 * - For each packet field:
 *
 *   (3.1) divide the b packet bits we want to classify into groups of size t,
 *   obtaining ceil(b / t) groups
 *
 *      Example: match on destination IP address, with t = 4: 32 bits, 8 groups
 *      of 4 bits each
 *
 *   (3.2) allocate a lookup table with one column ("bucket") for each possible
 *   value of a group, and with one row for each group
 *
 *      Example: 8 groups, 2^4 buckets:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0
 *        1
 *        2
 *        3
 *        4
 *        5
 *        6
 *        7
 *
 *   (3.3) map the bits we want to classify for the current field, for a given
 *   entry, to a single rule for non-ranged and netmask set items, and to one or
 *   multiple rules for ranges. Ranges are expanded to composing netmasks:
 *
 *      Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
 *      - rule #0: 10.0.0.5
 *      - rule #1: 192.168.1.0/24
 *      - rule #2: 192.168.2.0/31
 *
 *   (3.4) insert references to the rules in the lookup table, selecting buckets
 *   according to bit values of a rule in the given group
 *
 *      Example: given:
 *      - rule #0: 10.0.0.5 mapping to buckets
 *        < 0 10  0 0   0 0  0 5 >
 *      - rule #1: 192.168.1.0/24 mapping to buckets
 *        < 12 0  10 8  0 1  < 0..15 > < 0..15 > >
 *      - rule #2: 192.168.2.0/31 mapping to buckets
 *        < 12 0  10 8  0 2  0 < 0..1 > >
 *
 *      these bits are set in the lookup table:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0    0                                              1,2
 *        1   1,2                                      0
 *        2    0                                      1,2
 *        3    0                              1,2
 *        4  0,1,2
 *        5    0   1   2
 *        6  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        7   1,2 1,2  1   1   1  0,1  1   1   1   1   1   1   1   1   1   1
 *
 *   (3.5) if this is not the last field in the set, build a mapping table that
 *   maps rules from the lookup table to rules belonging to the same entry in
 *   the next lookup table
 *
 *      Example: 2 entries, 10.0.0.5:1024 and 192.168.1.0-192.168.2.1:2048
 *
 *      given lookup table #0 (see example above):
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0    0                                              1,2
 *        1   1,2                                      0
 *        2    0                                      1,2
 *        3    0                              1,2
 *        4  0,1,2
 *        5    0   1   2
 *        6  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        7   1,2 1,2  1   1   1  0,1  1   1   1   1   1   1   1   1   1   1
 *
 *      and lookup table #1 with:
 *      - rule #0: 1024 mapping to buckets
 *        < 0  0  4  0 >
 *      - rule #1: 2048 mapping to buckets
 *        < 0  0  5  0 >
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0   0,1
 *        1   0,1
 *        2                    0   1
 *        3   0,1
 *
 *      we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
 *      in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
 *      (rules #1, #2) to 2048 in lookup table #2 (rule #1):
 *
 *                 rules
 *               0   1   2
 *      map to:  0   1   1
 *
 *   (3.6) if this is the last field in the set, build a mapping table that maps
 *   rules from the last lookup table to verdict keys
 *
 *      Example: 10.0.0.5:1024 gives 66 as verdict and
 *      192.168.1.0-192.168.2.1:2048 gives 42. From the rules of lookup table #2
 *      as mapped above:
 *
 *                 rules
 *               0   1
 *      map to: 66  42
 *
 *
 * == 4. Matching
 *
 * (4.1) We use a result bitmap, with the size of a single lookup table bucket,
 * to represent the matching state that applies at every algorithm step.
 *
 * - For each packet field:
 *
 *   (4.2) start with an all-ones result bitmap
 *
 *   (4.3) perform a lookup into the table corresponding to the current field,
 *   for each group, and at every group, AND the current result bitmap with the
 *   value from the lookup table bucket
 *
 *      Example: 192.168.1.5 < 12 0  10 8  0 1  0 5 >, with lookup table from
 *      pre-computation example.
 *      Lookup table buckets are at least 3 bits wide, we'll arbitrarily assume
 *      they are 8 bits for convenience. Initial result bitmap is 0xff, the
 *      steps below show the value of the result bitmap after each group is
 *      processed:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0    0                                              1,2
 *
 *        result bitmap is now: 0xff & 0x6 [bucket 12] = 0x6
 *
 *        1   1,2                                      0
 *
 *        result bitmap is now: 0x6 & 0x6 [bucket 0] = 0x6
 *
 *        2    0                                      1,2
 *
 *        result bitmap is now: 0x6 & 0x6 [bucket 10] = 0x6
 *
 *        3    0                              1,2
 *
 *        result bitmap is now: 0x6 & 0x6 [bucket 8] = 0x6
 *
 *        4  0,1,2
 *
 *        result bitmap is now: 0x6 & 0x7 [bucket 0] = 0x6
 *
 *        5    0   1   2
 *
 *        result bitmap is now: 0x6 & 0x2 [bucket 1] = 0x2
 *
 *        6  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *
 *        result bitmap is now: 0x2 & 0x7 [bucket 0] = 0x2
 *
 *        7   1,2 1,2  1   1   1  0,1  1   1   1   1   1   1   1   1   1   1
 *
 *        final result bitmap for this field is: 0x2 & 0x3 [bucket 5] = 0x2
 *
 *   (4.4) at the next field, start with a new, all-zeroes result bitmap. For
 *   each bit set in the previous result bitmap, OR the new result bitmap with
 *   the corresponding value from the mapping table for this field
 *
 *      Example: with mapping table from pre-computation example, current result
 *      bitmap 0x02:
 *
 *                 rules
 *               0   1   2
 *      map to:  0   1   1
 *
 *      new result bitmap is 0x00 | 0x02 [bit/rule 1 was set] = 0x02
 *
 *      we can now extend this example to cover the second iteration of the step
 *      above (lookup and AND bitmap): assuming the port field is
 *      2048 < 0  0  5  0 >, with starting result bitmap 0x2, and lookup table
 *      for "port" field from pre-computation example:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0   0,1
 *        1   0,1
 *        2                    0   1
 *        3   0,1
 *
 *       operations are: 0x2 & 0x3 [bucket 0] & 0x3 [bucket 0] & 0x2 [bucket 5]
 *       & 0x3 [bucket 0], resulting bitmap is 0x2.
 *
 *   (4.5) if this is the last field in the set, look up the value from the
 *   verdict table corresponding to the final result bitmap
 *
 *      Example: 0x2 resulting bitmap from 192.168.1.5:2048, verdict table from
 *      pre-computation example:
 *
 *                 rules
 *               0   1
 *      map to: 66  42
 *
 *      the verdict is 42.
 *
 *
 * == 5. Ideas for Future Optimisations
 *
 * (5.1) Reduction of number of rules by allowing inverted netmasks
 *
 * CAVEAT: This needs some complicated tracking for inverted entries, which
 * aren't implemented here at all. In order to further halve the number of
 * rules as described below, we also need to use sets of netmasks, instead of a
 * single netmask for each entry, and intersect between them and their
 * inversions. This also introduces further complexity.
 * 
 * Theorem 3 in [Rottenstreich 2010] shows that every range of b bits can be
 * converted into b bitmaps, with inversions, as also illustrated by section 9
 * in [Kogan 2014].
 *
 * Further, if we allow partial subtractions between netmasks, for each range R
 * of b bits, that can be converted into a b0 number of bitmaps, there exists a
 * complementary range, R1, that can be converted into a b1 number of bitmaps,
 * such that b1 + b0 = b.
 *
 * [TODO: Formal proof. This consideration is intuitively true and was validated
 * by bruteforcing over the IPv4 address space, so we can safely use it in the
 * implementation for the moment.]
 *
 * By construction, this allows us to express any range of b bits as ceil(b / 2)
 * entries, either as reversed or direct entries. By allowing reversed entries,
 * we need to also provide a reverse mapping possibility, which is achieved by
 * modifying the mapping table from pre-computation step 4.5, and the step
 * itself, as follows:
 * - every bit corresponding to a reverse rule is set in the initial result
 *   bitmap, which is not necessarily an all-zeroes bitmap anymore
 * - the transformation step becomes a XOR operation: the XOR step with the
 *   modified initial bitmap reverses the result, if we guarantee that one
 *   single XOR step is performed. This comes, in turn, from optimisation 5.2
 *
 *   Example: match on 0.0.0.1-127.255.255.255:1024 and on 192.168.1.1:1024.
 *   A direct mapping means we need 31 rules for the first address range, using
 *   reversed rules we only need 2 rules instead (rule #0: 0.0.0.0/32,
 *   rule #1: 128.0.0.0/1). Rule #2: 192.168.1.1/32.
 *   First lookup table:
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0    0                               1   1   1   1  1,2  1   1   1
 *        1  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        2   0,1  1   1   1   1   1   1   1   1   1  1,2  1   1   1   1   1
 *        3   0,1  1   1   1   1   1   1   1  1,2  1   1   1   1   1   1   1
 *        4  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        5   0,1 1,2  1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        6  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        7   0,1 1,2  1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *
 *   Second lookup table: rule #0: 1024 for first entry, rule #1: 1024 for
 *                        second entry
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0   0,1
 *        1                   0,1
 *        2   0,1
 *        3   0,1
 *
 *    mapping table: initial bitmap is 0x1 (entry with rule #0 in second table
 *                   is reversed in first table: set bit #0)
 *                 rules
 *                  0   1   2
 *      XOR with: 0x1 0x1 0x2
 *
 * - expansion to reverse rules is selected if the number of reversed rules is
 *   strictly less than the number of needed direct rules.
 *
 * (5.2) Single step for match and fill from result bitmap
 *
 * CAVEAT: This significantly affects complexity of listing and deletion,
 * because once rules are replaced, further steps are needed to rebuild the
 * originating entries.
 * 
 * Step 4.4 loops over all bits set in the current result bitmap: this is
 * needed as set entries might overlap, for one or more fields. The
 * computational complexity of mapping steps is O(m * log(n)), with m > 1
 * overlapping factor, because we have no tight bounds on the number of
 * overlapping entries. However, we can apply a reduction step on lookup tables,
 * and ensure a single bit is set in the result bitmap after each lookup table
 * is considered, with three observations:
 *
 * (5.2.1) In case two entries overlap entirely in a given lookup table, and
 * their rules are present in the previous mapping table (or if it's the first
 * lookup table), the entries can map to the same set of rules without loss of
 * generality.
 *
 *   Example: given entries 192.168.1.0/24:1024 and 192.168.1.0/24:2048, first
 *            lookup table initially has rules:
 *              #0: 192.168.1.0/24
 *              #1: 192.168.1.0/24
 *            and second lookup table has rules:
 *              #0: 1024
 *              #1: 2048
 *            with mapping table #0 -> #0 | #1 -> #1.
 *            After this reduction step, the first lookup table has a single
 *            rule (#0: 192.168.1.0/24), mapping to both rule #0 and #1.
 *
 *   Example: given entries 192.168.1.0/24:1024:10.0.0.1 and
 *            192.168.2.0/24:1024:10.0.0.1, we don't apply any reduction step.
 *            First lookup table has rules:
 *              #0: 192.168.1.0/24
 *              #1: 192.168.2.0/24
 *            and second lookup table has rules:
 *              #0: 1024
 *              #1: 1024
 *            with first mapping table #0 -> #0 | #1 -> #1. As there are no
 *            images in the first lookup table with both rules #0 and #1 set,
 *            these two rules will be preserved. Third lookup table has two
 *            rules:
 *              #0: 10.0.0.1/32
 *              #1: 10.0.0.1/32
 *            and second mapping table is: #0 -> #0 | #1 -> #1.
 *
 * (5.2.2) In case two entries overlap partially in a given lookup table, due to
 * the way we expand ranges to netmasks, one set of rules is always a proper
 * subset of the other overlapping one. That is, we can hit this situation:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0       x,y
 *        1   x,y x,y  y   y
 *
 * but not:
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0       x,y
 *        1    x  x,y  y   y
 *
 * so in case entries corresponding to x map to a proper subset of the y rules,
 * x rules never occur without y rules. We can then replace the x,y pairs by
 * x, and merge their mapping tables.
 *
 *   Example: entries: 192.168.1.1-192.168.1.2:1
 *              expands to: 192.168.1.1/32:1 (#0:#0), 192.168.1.2/32:1 (#1:#0)
 *                     192.168.1.0-192.168.1.1:2
 *              expands to: 192.168.1.0/31:2 (#2:#1)
 *                     192.168.1.0-192.168.1.2:3
 *              expands to: 192.168.1.0/31:3 (#3:#2), 192.168.1.2/32:3 (#4:#2)
 *
 *   First lookup table (only last group shown):
 *                     bucket
 *      group  0    1    2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *       ...
 *        7   2,3 0,2,3 1,4
 *
 *   Second lookup table (only last group shown):
 *                     bucket
 *      group  0    1    2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *       ...
 *        3         0    1   2
 *
 *   Initial mapping table: #0 -> #0 | #1 -> #0 | #2 -> #1 | #3 -> #2 | #4 -> #2
 *
 *   Apply reduction step from 5.2.1 twice to first lookup table: rule #1
 *   entirely overlaps with #4, #2 entirely overlaps with #3. New lookup table:
 *
 *                     bucket
 *      group  0    1    2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *       ...
 *        7    2   0,2   1
 *
 *   New mapping table: #0 -> #0 | #1 -> #0, #2 | #2 -> #1, #2
 *
 *   Apply reduction step from 5.2.2: 0 never occurs without 2:
 *
 *                     bucket
 *      group  0    1    2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *       ...
 *        7    2    0    1
 *
 *   New mapping table: #0 -> #0, #1, #2 | #1 -> #0, #2 | #2 -> #1, #2
 *
 * (5.2.3) There are no other ways entries can overlap (directly from set
 * theory)
 * [TODO: Formal proof or reference.]
 *
 * (5.3) Single step for invert, match and bitmap fill operations (comes almost
 * for free with 5.2 and 5.1).
 *
 * We can now replace the loop from 4.4 with a single XOR operation
 * depending on the single bit set in the result bitmask after a lookup table is
 * evaluated. Computational complexity of that step becomes, in theory, O(1):
 * a single bit is set, hence lookup of the corresponding bitmap can be done in
 * constant time. For all practical implementations, this is O(log(n)): the
 * number of words in the mapping table grows linearly with the number of
 * entries, but number of rules grows sublinearly and we can find the set bit
 * with very efficient operations, too (e.g. x86's TZCNT, AVX-512 VPLZCNTQ,
 * PPC's CNTLZ, etc.)]
 *
 * (5.4) [TODO: Evaluate if this provides an actual advantage even with an
 * implementation of step 4.3 optionally based on AVX2 VPAND instrinsics.]
 * As an alternative to finding the first bit set in the result bitmap for the
 * mapping table lookup, we can terminate step 4.3 early, once we reach the last
 * group of a table and a set bit is found in the result, and use the position
 * of this bit directly for the mapping table lookup.
 *
 * (5.5) [TODO: Evaluate how bad this is on architectures where branching is
 * expensive, e.g. PPC, find out if it makes sense to implement this optionally
 * depending on architecture.]
 * - We can terminate the algorithm early if the initial result bitmap, as we
 *   start processing the next lookup table, is empty: that means that the
 *   packet won't match any entry
 * - We can terminate step 4.3 early if the result bitmap happens to be empty
 *   after a group is entirely evaluated: this means the result bitmap will be
 *   also empty at the beginning of step 4.4. We can't terminate the whole
 *   algorithm in this case, though, as entries might be represented with an
 *   inverted set of rules.
 *
 * (5.6) The mapping table described in step (3.5) grows in size linearly with
 * the product of rules for the current field and rules for the next field. For
 * entries with non-overlapping rules in a given field, this size would grow
 * with the number of rules for this field, independently of the fact that these
 * rules might map to the same set of rules in the next field.
 *
 *      Example: one entry, 192.168.1.0-192.168.2.1:2048
 *
 *      given lookup table #0 (see example above):
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0    0                                              1,2
 *        1   1,2                                      0
 *        2    0                                      1,2
 *        3    0                              1,2
 *        4  0,1,2
 *        5    0   1   2
 *        6  0,1,2 1   1   1   1   1   1   1   1   1   1   1   1   1   1   1
 *        7   1,2 1,2  1   1   1  0,1  1   1   1   1   1   1   1   1   1   1
 *
 *      and lookup table #1 with:
 *      - rule #0: 1024 mapping to buckets
 *        < 0  0  4  0 >
 *      - rule #1: 2048 mapping to buckets
 *        < 0  0  5  0 >
 *
 *                     bucket
 *      group  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
 *        0   0,1
 *        1   0,1
 *        2                    0   1
 *        3   0,1
 *
 *      we need to map rules for 10.0.0.5 in lookup table #0 (rule #0) to 1024
 *      in lookup table #1 (rule #0) and rules for 192.168.1.0-192.168.2.1
 *      (rules #1, #2) to 2048 in lookup table #2 (rule #1):
 *
 *                 rules
 *               0   1   2
 *      map to:  0   1   1
 *
 *
 * == 6. References
 *
 * [Ligatti 2010]
 *	A Packet-classification Algorithm for Arbitrary Bitmask Rules, with
 *	Automatic Time-space Tradeoffs
 *	Jay Ligatti, Josh Kuhn, and Chris Gage.
 *	Proceedings of the IEEE International Conference on Computer
 *	Communication Networks (ICCCN), August 2010.
 *	http://www.cse.usf.edu/~ligatti/papers/grouper-conf.pdf
 *
 * [Rottenstreich 2010]
 *	Worst-Case TCAM Rule Expansion
 *	Ori Rottenstreich and Isaac Keslassy.
 *	2010 Proceedings IEEE INFOCOM, San Diego, CA, 2010.
 *	http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.212.4592&rep=rep1&type=pdf
 *
 * [Kogan 2014]
 *	SAX-PAC (Scalable And eXpressive PAcket Classification)
 *	Kirill Kogan, Sergey Nikolenko, Ori Rottenstreich, William Culhane,
 *	and Patrick Eugster.
 *	Proceedings of the 2014 ACM conference on SIGCOMM, August 2014.
 *	http://www.sigcomm.org/sites/default/files/ccr/papers/2014/August/2619239-2626294.pdf
 */

#ifndef VERBOSE
#define PERF_TEST_BLOCK		100
#endif

#include <stdio.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include <time.h>

#include "pipapo.h"
#include "set.h"
#include "match.h"
#include "util.h"

/* Specifiers for set description: label, type, length, and offset in packet */
struct desc_spec desc_specs[] = {
	{ "key",	KEY,	0,		0 },
	{ "dmac",	MAC,	MAC_LEN,	0 },
	{ "smac",	MAC,	MAC_LEN,	6 },
	{ "saddr",	ADDR,	ADDR_LEN,	26 },
	{ "daddr",	ADDR,	ADDR_LEN,	30 },
	{ "sport",	PORT,	PORT_LEN,	34 },
	{ "dport",	PORT,	PORT_LEN,	36 },
	{ "saddr6",	ADDR6,	ADDR6_LEN,	22 },
	{ "daddr6",	ADDR6,	ADDR6_LEN,	38 },
	{ "sport6",	PORT,	PORT_LEN,	54 },
	{ "dport6",	PORT,	PORT_LEN,	56 },
	{ NULL,		0,	0,		0 },
};

/**
 * desc_parse_spec() - Parse set layout from tokens on first line of set file
 * @d:		Pre-allocated set description to fill in
 * @n:		Number of field tokens
 * @tokens:	Labels of fields in desired set layout
 *
 * Return: 0 on success, negative error code on failure
 */
int desc_parse_spec(struct desc *d, int n, char (*tokens)[BUFSIZ])
{
	struct desc_spec *spec;
	int i;

	d->row_size = sizeof(enum set_ops);

	for (i = 0; i < n; i++) {
		for (spec = desc_specs; spec->label; spec++) {
			if (!strncmp(tokens[i], spec->label, BUFSIZ))
				break;
		}

		if (!spec->label) {
			fprintf(stderr, "Invalid set specifier token %s\n",
				tokens[i]);
			return -EINVAL;
		}

		d->layout[i] = spec;

		if (spec->type == KEY) {
			d->row_size += sizeof(uint32_t);
			continue;
		}

		d->fields++;
		if (spec->type == ADDR)
			d->row_size += sizeof(struct addr);
		else if (spec->type == ADDR6)
			d->row_size += sizeof(struct addr6);
		else if (spec->type == PORT)
			d->row_size += sizeof(struct port);
		else if (spec->type == MAC)
			d->row_size += sizeof(struct mac);
	}

	return 0;
}

/**
 * desc_parse_entry() - Parse set entries from set file
 * @d:		Pre-allocated set description, d->data to be filled
 * @at:		Starting position in d->data to be filled with next set entry
 * @n:		Number of fields
 * @tokens:	Set items: addresses, ports, ranges thereof
 *
 * Return: 0 on success, negative error code on failure
 */
int desc_parse_entry(struct desc *d, int at, int n, char (*tokens)[BUFSIZ])
{
	int i, base = at * d->row_size, offset, b;
	struct addr6 a6;
	struct addr a;
	struct port p;
	char *c, *end;
	struct mac m;
	uint32_t key;
	uint8_t v[6];

	if (!strncmp(tokens[0], "list", strlen(tokens[0]))) {
		*(enum set_ops *)(d->data + base) = LIST;
		return 0;
	}

	if (!strncmp(tokens[0], "add", strlen(tokens[0])))
		*(enum set_ops *)(d->data + base) = ADD;
	else if (!strncmp(tokens[0], "delete", strlen(tokens[0])))
		*(enum set_ops *)(d->data + base) = DEL;
	else
		return -EINVAL;
	offset = sizeof(enum set_ops);
	tokens++;
	n--;

	for (i = 0; i < n; i++) {
		switch (d->layout[i]->type) {
		case KEY:
			key = strtoul(tokens[i], &end, 10);
			if (*end)
				return -EINVAL;

			memcpy(d->data + base + offset, &key, sizeof(key));
			offset += sizeof(key);
			break;
		case ADDR:
			/* IPv4... */
			if ((c = strchr(tokens[i], '-'))) {
				/* range */
				if (!inet_aton(c + 1, (struct in_addr *)&a.end))
					return -EINVAL;
				a.cidr = 0;
				*c = 0;
			} else if ((c = strchr(tokens[i], '/'))) {
				/* netmask */
				*c = 0;
				a.cidr = strtoul(c + 1, &end, 10);
				if (*end)
					return -EINVAL;
			} else {
				/* address */
				a.cidr = 32;
			}

			if (!inet_aton(tokens[i], (struct in_addr *)&a.start))
				return -EINVAL;

			memcpy(d->data + base + offset, &a, sizeof(a));
			offset += sizeof(a);
			break;
		case ADDR6:
			/* IPv6... */
			if ((c = strchr(tokens[i], '-'))) {
				/* range */
				if (!inet_pton(AF_INET6, c + 1, &a6.end))
					return -EINVAL;
				a6.cidr = 0;
				*c = 0;
			} else if ((c = strchr(tokens[i], '/'))) {
				/* netmask */
				*c = 0;
				a6.cidr = strtoul(c + 1, &end, 10);
				if (*end)
					return -EINVAL;
			} else {
				/* address */
				a6.cidr = 128;
			}

			if (!inet_pton(AF_INET6, tokens[i], &a6.start))
				return -EINVAL;

			memcpy(d->data + base + offset, &a6, sizeof(a6));
			offset += sizeof(a6);
			break;
		case PORT:
			if ((c = strchr(tokens[i], '-'))) {
				*c = 0;
				p.end = htons(strtoul(c + 1, &end, 10));
				if (*end)
					return -EINVAL;
			} else {
				p.end = 0;
			}

			p.start = htons(strtoul(tokens[i], &end, 10));
			if (*end)
				return -EINVAL;

			memcpy(d->data + base + offset, &p, sizeof(p));
			offset += sizeof(p);
			break;
		case MAC:
			if ((c = strchr(tokens[i], '-')))
				*c = 0;

			b = sscanf(tokens[i],
				   "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx%*c",
				   v + 0, v + 1, v + 2, v + 3, v + 4, v + 5);
			if (b != 6)
				return -EINVAL;
			memcpy(&m.start, v, 6);

			if (c) {
				b = sscanf(c + 1,
					   "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx%*c",
					   v + 0, v + 1, v + 2,
					   v + 3, v + 4, v + 5);
				if (b != 6)
					return -EINVAL;
				memcpy(&m.end, v, 6);
			} else {
				memset(&m.end, 0, 6);
			}

			memcpy(d->data + base + offset, &m, sizeof(m));
			offset += sizeof(m);
			break;
		}
	}

	return 0;
}

/**
 * usage() - Display usage and exit
 * @argv0:	Executable name
 */
void usage(const char *argv0)
{
	fprintf(stderr, "%s <ruleset> <packets>\n", argv0);
	exit(1);
}

/**
 * packet_parse() - Parse packets and expected key from path into binary format
 * @path:	Path to packet file
 * @ret:	Pointer to packet buffer, incremented by this function
 *
 * Return: 0 on success, negative error code on failure
 */
static int packet_parse(const char *path, uint8_t **ret)
{
	int alloc, n, byte_count, packet_count = 0, err;
	uint8_t *packets, *pos, *tmp;
	char buf[BUFSIZ], *end;
	char *p;
	FILE *f;

	alloc = BUFSIZ;
	pos = packets = malloc(alloc * (sizeof(uint32_t) + PACKET_SIZE));
	if (!packets)
		return -ENOMEM;

	f = fopen(path, "r");
	if (!f) {
		err = -errno;
		goto fail;
	}

	while (fgets(buf, BUFSIZ, f)) {
		if (!*buf || *buf == '#' || *buf == ' ' || *buf == '\n')
			continue;

		if (++packet_count > alloc) {
			alloc += BUFSIZ;
			tmp = realloc(packets,
				      alloc * (sizeof(uint32_t) + PACKET_SIZE));
			if (!tmp) {
				err = -ENOMEM;
				goto fail_close;
			}
			packets = tmp;
			pos = packets;
			pos += (packet_count - 1) *
			       (sizeof(uint32_t) + PACKET_SIZE);
		}

		/* Key */
		p = strtok(buf, " \t");
		n = strtoul(p, &end, 0);
		if (*end) {
			err = -EINVAL;
			goto fail_close;
		}
		*(uint32_t *)pos = n;
		pos += sizeof(uint32_t);

		/* Packet bytes */
		byte_count = 0;
		while ((p = strtok(NULL, " \t\n"))) {
			n = strtoul(p, &end, 0);
			if (*end) {
				err = -EINVAL;
				goto fail_close;
			}
			*pos = n;
			pos++;
			if (++byte_count >= PACKET_SIZE)
				break;
		}
		if (byte_count < PACKET_SIZE) {
			memset(pos, 0, PACKET_SIZE - byte_count);
			pos += PACKET_SIZE - byte_count;
		}
	}
	fclose(f);

	*ret = packets;
	verbose("Read %i packets from %s\n", packet_count, path);

	return packet_count;

fail_close:
	fclose(f);
fail:
	free(packets);
	return err;
}

/**
 * desc_parse() - Parse set file from path
 * @path:	Path to set file
 * @d:		Pre-allocated set, with description and data parts
 *
 * Return: 0 on success, negative error code on failure
 */
int desc_parse(const char *path, struct desc *d)
{
	int first = 1, err = 0, alloc = 0, n;
	char t[9][BUFSIZ];
	char buf[BUFSIZ];
	uint8_t *tmp;
	FILE *f;

	f = fopen(path, "r");
	if (!f)
		return -errno;

	while (fgets(buf, BUFSIZ, f)) {
		n = sscanf(buf, "%s %s %s %s %s %s %s %s %s", t[0],
			   t[1], t[2], t[3], t[4], t[5], t[6], t[7], t[8]);
		if (!n || *t[0] == '#')
			continue;

		if (first) {
			/* First line is the set type specifier */
			err = desc_parse_spec(d, n, t);
			if (err)
				goto out;

			first = 0;
			continue;
		}

		if (d->entries == alloc) {
			alloc += BUFSIZ;
			tmp = realloc(d->data, alloc * d->row_size);
			if (!tmp) {
				err = -ENOMEM;
				goto out;
			}
			d->data = tmp;
		}

		err = desc_parse_entry(d, d->entries++, n, t);
	}

out:
	fclose(f);
	return err;
}

/**
 * main() - Entry point: parse set and packets, pre-compute, match
 * @argc:	Argument count
 * @argv:	Path to set description and packet files
 *
 * Return: 0 on success, 1 on matching failure, negative error code otherwise
 */
int main(int argc, char **argv)
{
	int packet_count, offset = 0, i, err;
	uint8_t *packets, *ptr;
	struct desc d = { 0 };
	struct kernel_set *ks;
	struct set s = { 0 };
	uint32_t key;
#ifdef VERBOSE
	char *total_mem_suffix = "B";
	int total_mem = 0;
	struct field *f;
#else
	struct timespec tp_start, tp_end;
	int repeat = 0, b = 0;
	double time = 0;
#endif
	(void)key;

	if (argc != 3)
		usage(argv[0]);

	verbose("=== Parsing\n");

	if ((packet_count = packet_parse(argv[2], &packets)) < 0) {
		fprintf(stderr, "Parsing failed, %s\n", strerror(packet_count));
		exit(packet_count);
	}

	if ((err = desc_parse(argv[1], &d))) {
		fprintf(stderr, "Parsing failed, %s\n", strerror(err));
		return err;
	}
	verbose("\n");

	verbose("=== Pre-computation\n");

	if ((err = init(&s, d.layout))) {
		fprintf(stderr, "Set init failed, %s\n", strerror(err));
		return err;
	}

	for (i = 0; i < d.entries; i++) {
		if (*(enum set_ops *)(d.data + offset) == ADD)
			err = add(&s, d.layout,
				  d.data + offset + sizeof(enum set_ops));
		else if (*(enum set_ops *)(d.data + offset) == LIST)
			list_or_del(&s, d.layout, NULL);
		else if (*(enum set_ops *)(d.data + offset) == DEL)
			err = list_or_del(&s, d.layout,
					  d.data + offset +
					  sizeof(enum set_ops));

		if (err)
			return err;
		offset += d.row_size;
	}

	for (i = 0; d.layout[i]->type != KEY; i++) {
		verbose("\n");
		verbose("Lookup table for %s:\n", d.layout[i]->label);
		show_lookup(&s.fields[i]);
		verbose("Mapping table for %s:\n", d.layout[i]->label);
		show_mapping(&s.fields[i], d.layout[i + 1]->type == KEY);
	}
	verbose("\n");

	verbose("=== Match\n");

	ks = kernel_init(&s, d.layout);

#ifndef VERBOSE
repeat_block:
	repeat = PERF_TEST_BLOCK;
	b++;
	clock_gettime(CLOCK_MONOTONIC, &tp_start);
	do {
#endif
		ptr = packets;
		for (i = 0; i < packet_count; i++) {
			key = match(ks, ptr + sizeof(uint32_t));
#ifdef VERBOSE
			verbose("Packet #%i => key: %u\n\n", i, key);
			if (key != *(uint32_t *)ptr) {
				verbose("FAIL: expected key: %u\n",
					*(uint32_t *)ptr);
				return 1;
			}
#endif
			ptr += PACKET_SIZE + sizeof(uint32_t);
		}
#ifndef VERBOSE
	} while (repeat--);
	clock_gettime(CLOCK_MONOTONIC, &tp_end);
	time += (tp_end.tv_sec - tp_start.tv_sec) +
	        (tp_end.tv_nsec - tp_start.tv_nsec) / 1E9;
	if (time < 5) {
		goto repeat_block;
	}

	fprintf(stdout, "Matched %lfM packets in %lfs (%lf Mpps)\n",
		PERF_TEST_BLOCK * b * packet_count / 1000.0 / 1000,
		time,
		PERF_TEST_BLOCK * b * packet_count / 1000.0 / 1000 / time);
#endif

#ifdef VERBOSE
	verbose("\n");

	verbose("\n=== Memory Usage\n");
	for_each_field(f, &s, d.layout) {
		char *lt_suffix = "B", *mt_suffix = "B", *field_suffix = "B";
		int lt_size = f->groups * BUCKETS * f->bsize;
		int mt_size = f->rules * sizeof(*f->mt);
		int field_size;

		total_mem += field_size = lt_size + mt_size + sizeof(*f);

		if (lt_size > 20 * 1024) {
			lt_size /= 1024;
			lt_suffix = "KiB";
		}

		if (mt_size > 20 * 1024) {
			mt_size /= 1024;
			mt_suffix = "KiB";
		}

		if (field_size > 20 * 1024) {
			field_size /= 1024;
			field_suffix = "KiB";
		}

		verbose("Field %i, %s, %i rules:\n", i, d.layout[i]->label,
			f->rules);
		verbose("  lookup table: %i%s\n", lt_size, lt_suffix);
		free(f->lt);
		verbose("  mapping table: %i%s\n", mt_size, mt_suffix);
		free(f->mt);
		verbose("  total: %i%s\n", field_size, field_suffix);
		verbose("\n");
	}

	if (total_mem > 50 * 1024 * 1024) {
		total_mem /= 1024 * 1024;
		total_mem_suffix = "MiB";
	} else if (total_mem > 20 * 1024) {
		total_mem /= 1024;
		total_mem_suffix = "KiB";
	}
	verbose("Total: %i%s\n", total_mem, total_mem_suffix);
#endif

	/* Rest of allocated data is volatile */
	free(s.fields);
	free(d.data);
	free(packets);
	free(ks->map[0]);
	free(ks->map[1]);
	for (i = 0; i < 8; i++) {
		free(ks->lt[i]);
		free(ks->mt[i]);
	}
	free(ks);

	return 0;
}