bpf: Implement BPF ring buffer and verifier support for it

author Andrii Nakryiko <andriin@fb.com>

Fri, 29 May 2020 07:54:20 +0000 (00:54 -0700)

committer Alexei Starovoitov <ast@kernel.org>

Mon, 1 Jun 2020 21:38:22 +0000 (14:38 -0700)
author Andrii Nakryiko <andriin@fb.com>
Fri, 29 May 2020 07:54:20 +0000 (00:54 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Mon, 1 Jun 2020 21:38:22 +0000 (14:38 -0700)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h

index efe8836b5c48a545778ea083a63cc42004c07853..e5884f7f801cc901886ad8f442c1c8568006771e 100644 (file)
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -90,6 +90,8 @@ struct bpf_map_ops {
         int (*map_direct_value_meta)(const struct bpf_map *map,
                                      u64 imm, u32 *off);
         int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
+       __poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
+                            struct poll_table_struct *pts);
  };
  
  struct bpf_map_memory {
@@ -244,6 +246,9 @@ enum bpf_arg_type {
         ARG_PTR_TO_LONG,        /* pointer to long */
         ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock (fullsock) */
         ARG_PTR_TO_BTF_ID,      /* pointer to in-kernel struct */
+       ARG_PTR_TO_ALLOC_MEM,   /* pointer to dynamically allocated memory */
+       ARG_PTR_TO_ALLOC_MEM_OR_NULL,   /* pointer to dynamically allocated memory or NULL */
+       ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
  };
  
  /* type of values returned from helper functions */
@@ -255,6 +260,7 @@ enum bpf_return_type {
         RET_PTR_TO_SOCKET_OR_NULL,      /* returns a pointer to a socket or NULL */
         RET_PTR_TO_TCP_SOCK_OR_NULL,    /* returns a pointer to a tcp_sock or NULL */
         RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
+       RET_PTR_TO_ALLOC_MEM_OR_NULL,   /* returns a pointer to dynamically allocated memory or NULL */
  };
  
  /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
@@ -322,6 +328,8 @@ enum bpf_reg_type {
         PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
         PTR_TO_BTF_ID,           /* reg points to kernel struct */
         PTR_TO_BTF_ID_OR_NULL,   /* reg points to kernel struct or NULL */
+       PTR_TO_MEM,              /* reg points to valid memory region */
+       PTR_TO_MEM_OR_NULL,      /* reg points to valid memory region or NULL */
  };
  
  /* The information passed from prog-specific *_is_valid_access
@@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto;
  extern const struct bpf_func_proto bpf_jiffies64_proto;
  extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto;
  extern const struct bpf_func_proto bpf_event_output_data_proto;
+extern const struct bpf_func_proto bpf_ringbuf_output_proto;
+extern const struct bpf_func_proto bpf_ringbuf_reserve_proto;
+extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
+extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
+extern const struct bpf_func_proto bpf_ringbuf_query_proto;
  
  const struct bpf_func_proto *bpf_tracing_func_proto(
         enum bpf_func_id func_id, const struct bpf_prog *prog);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h

index 29d22752fc870d5b098fbe47cde9497cf932f95a..fa8e1b552acd04290645d677fd8dba682b594f4b 100644 (file)
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
  #if defined(CONFIG_BPF_JIT)
  BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
  #endif
+BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
  
  BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
  BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h

index ea833087e85350e000823ca1a5a36ae1addd2dad..ca08db4ffb5f74f6ae4655322499d4df4cdd2d72 100644 (file)
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -54,6 +54,8 @@ struct bpf_reg_state {
  
                 u32 btf_id; /* for PTR_TO_BTF_ID */
  
+               u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
+
                 /* Max size from any of the above. */
                 unsigned long raw;
         };
@@ -63,6 +65,8 @@ struct bpf_reg_state {
          * offset, so they can share range knowledge.
          * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
          * came from, when one is tested for != NULL.
+        * For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
+        * for the purpose of tracking that it's freed.
          * For PTR_TO_SOCKET this is used to share which pointers retain the
          * same reference to the socket, to determine proper reference freeing.
          */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index 54b93f8b49b83801ffe2177a5bde3bb4c07b6f75..974ca6e948e38e3689b827820ab1ce0eeec0990b 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -147,6 +147,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_SK_STORAGE,
         BPF_MAP_TYPE_DEVMAP_HASH,
         BPF_MAP_TYPE_STRUCT_OPS,
+       BPF_MAP_TYPE_RINGBUF,
  };
  
  /* Note that tracing related programs such as
@@ -3157,6 +3158,59 @@ union bpf_attr {
   *             **bpf_sk_cgroup_id**\ ().
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
+ *     Description
+ *             Copy *size* bytes from *data* into a ring buffer *ringbuf*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             0, on success;
+ *             < 0, on error.
+ *
+ * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
+ *     Description
+ *             Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ *     Return
+ *             Valid pointer with *size* bytes of memory available; NULL,
+ *             otherwise.
+ *
+ * void bpf_ringbuf_submit(void *data, u64 flags)
+ *     Description
+ *             Submit reserved ring buffer sample, pointed to by *data*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             Nothing. Always succeeds.
+ *
+ * void bpf_ringbuf_discard(void *data, u64 flags)
+ *     Description
+ *             Discard reserved ring buffer sample, pointed to by *data*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             Nothing. Always succeeds.
+ *
+ * u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
+ *     Description
+ *             Query various characteristics of provided ring buffer. What
+ *             exactly is queried is determined by *flags*:
+ *               - BPF_RB_AVAIL_DATA - amount of data not yet consumed;
+ *               - BPF_RB_RING_SIZE - the size of ring buffer;
+ *               - BPF_RB_CONS_POS - consumer position (can wrap around);
+ *               - BPF_RB_PROD_POS - producer(s) position (can wrap around);
+ *             Data returned is just a momentary snapshot of actual values
+ *             and could be inaccurate, so this facility should be used to
+ *             power heuristics and for reporting, not to make 100% correct
+ *             calculation.
+ *     Return
+ *             Requested value, or 0, if flags are not recognized.
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3288,7 +3342,12 @@ union bpf_attr {
         FN(seq_printf),                 \
         FN(seq_write),                  \
         FN(sk_cgroup_id),               \
-       FN(sk_ancestor_cgroup_id),
+       FN(sk_ancestor_cgroup_id),      \
+       FN(ringbuf_output),             \
+       FN(ringbuf_reserve),            \
+       FN(ringbuf_submit),             \
+       FN(ringbuf_discard),            \
+       FN(ringbuf_query),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
   * function eBPF program intends to call
@@ -3398,6 +3457,29 @@ enum {
         BPF_F_GET_BRANCH_RECORDS_SIZE   = (1ULL << 0),
  };
  
+/* BPF_FUNC_ringbuf_submit, BPF_FUNC_ringbuf_discard, and
+ * BPF_FUNC_ringbuf_output flags.
+ */
+enum {
+       BPF_RB_NO_WAKEUP                = (1ULL << 0),
+       BPF_RB_FORCE_WAKEUP             = (1ULL << 1),
+};
+
+/* BPF_FUNC_ringbuf_query flags */
+enum {
+       BPF_RB_AVAIL_DATA = 0,
+       BPF_RB_RING_SIZE = 1,
+       BPF_RB_CONS_POS = 2,
+       BPF_RB_PROD_POS = 3,
+};
+
+/* BPF ring buffer constants */
+enum {
+       BPF_RINGBUF_BUSY_BIT            = (1U << 31),
+       BPF_RINGBUF_DISCARD_BIT         = (1U << 30),
+       BPF_RINGBUF_HDR_SZ              = 8,
+};
+
  /* Mode for BPF_FUNC_skb_adjust_room helper. */
  enum bpf_adj_room_mode {
         BPF_ADJ_ROOM_NET,
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile

index 375b933010dd48054839f59d12c0b38a57c5f5f2..8fca02f64811ef67ce8029f2db866e94adaee7bf 100644 (file)
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init)
  
  obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
  obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
-obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o
+obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
  obj-$(CONFIG_BPF_SYSCALL) += disasm.o
  obj-$(CONFIG_BPF_JIT) += trampoline.o
  obj-$(CONFIG_BPF_SYSCALL) += btf.o
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c

index bb4fb634275e4ff07f8827f826b5d7e3e08227da..be43ab3e619f973d49cceccae8369b8ec2918eaf 100644 (file)
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id)
                 return &bpf_ktime_get_ns_proto;
         case BPF_FUNC_ktime_get_boot_ns:
                 return &bpf_ktime_get_boot_ns_proto;
+       case BPF_FUNC_ringbuf_output:
+               return &bpf_ringbuf_output_proto;
+       case BPF_FUNC_ringbuf_reserve:
+               return &bpf_ringbuf_reserve_proto;
+       case BPF_FUNC_ringbuf_submit:
+               return &bpf_ringbuf_submit_proto;
+       case BPF_FUNC_ringbuf_discard:
+               return &bpf_ringbuf_discard_proto;
+       case BPF_FUNC_ringbuf_query:
+               return &bpf_ringbuf_query_proto;
         default:
                 break;
         }
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c

new file mode 100644 (file)

index 0000000..180414b
--- /dev/null
+++ b/kernel/bpf/ringbuf.c
@@ -0,0 +1,501 @@
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/err.h>
+#include <linux/irq_work.h>
+#include <linux/slab.h>
+#include <linux/filter.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/poll.h>
+#include <uapi/linux/btf.h>
+
+#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)
+
+/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
+#define RINGBUF_PGOFF \
+       (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
+/* consumer page and producer page */
+#define RINGBUF_POS_PAGES 2
+
+#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)
+
+/* Maximum size of ring buffer area is limited by 32-bit page offset within
+ * record header, counted in pages. Reserve 8 bits for extensibility, and take
+ * into account a few extra pages for consumer/producer pages and
+ * non-mmap()'able parts. This gives 64GB limit, which seems plenty for a
+ * single ring buffer.
+ */
+#define RINGBUF_MAX_DATA_SZ \
+       (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)
+
+struct bpf_ringbuf {
+       wait_queue_head_t waitq;
+       struct irq_work work;
+       u64 mask;
+       struct page **pages;
+       int nr_pages;
+       spinlock_t spinlock ____cacheline_aligned_in_smp;
+       /* Consumer and producer counters are put into separate pages to allow
+        * mapping consumer page as r/w, but restrict producer page to r/o.
+        * This protects producer position from being modified by user-space
+        * application and ruining in-kernel position tracking.
+        */
+       unsigned long consumer_pos __aligned(PAGE_SIZE);
+       unsigned long producer_pos __aligned(PAGE_SIZE);
+       char data[] __aligned(PAGE_SIZE);
+};
+
+struct bpf_ringbuf_map {
+       struct bpf_map map;
+       struct bpf_map_memory memory;
+       struct bpf_ringbuf *rb;
+};
+
+/* 8-byte ring buffer record header structure */
+struct bpf_ringbuf_hdr {
+       u32 len;
+       u32 pg_off;
+};
+
+static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
+{
+       const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN |
+                           __GFP_ZERO;
+       int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
+       int nr_data_pages = data_sz >> PAGE_SHIFT;
+       int nr_pages = nr_meta_pages + nr_data_pages;
+       struct page **pages, *page;
+       struct bpf_ringbuf *rb;
+       size_t array_size;
+       int i;
+
+       /* Each data page is mapped twice to allow "virtual"
+        * continuous read of samples wrapping around the end of ring
+        * buffer area:
+        * ------------------------------------------------------
+        * | meta pages |  real data pages  |  same data pages  |
+        * ------------------------------------------------------
+        * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
+        * ------------------------------------------------------
+        * |            | TA             DA | TA             DA |
+        * ------------------------------------------------------
+        *                               ^^^^^^^
+        *                                  |
+        * Here, no need to worry about special handling of wrapped-around
+        * data due to double-mapped data pages. This works both in kernel and
+        * when mmap()'ed in user-space, simplifying both kernel and
+        * user-space implementations significantly.
+        */
+       array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
+       if (array_size > PAGE_SIZE)
+               pages = vmalloc_node(array_size, numa_node);
+       else
+               pages = kmalloc_node(array_size, flags, numa_node);
+       if (!pages)
+               return NULL;
+
+       for (i = 0; i < nr_pages; i++) {
+               page = alloc_pages_node(numa_node, flags, 0);
+               if (!page) {
+                       nr_pages = i;
+                       goto err_free_pages;
+               }
+               pages[i] = page;
+               if (i >= nr_meta_pages)
+                       pages[nr_data_pages + i] = page;
+       }
+
+       rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
+                 VM_ALLOC | VM_USERMAP, PAGE_KERNEL);
+       if (rb) {
+               rb->pages = pages;
+               rb->nr_pages = nr_pages;
+               return rb;
+       }
+
+err_free_pages:
+       for (i = 0; i < nr_pages; i++)
+               __free_page(pages[i]);
+       kvfree(pages);
+       return NULL;
+}
+
+static void bpf_ringbuf_notify(struct irq_work *work)
+{
+       struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);
+
+       wake_up_all(&rb->waitq);
+}
+
+static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
+{
+       struct bpf_ringbuf *rb;
+
+       if (!data_sz || !PAGE_ALIGNED(data_sz))
+               return ERR_PTR(-EINVAL);
+
+#ifdef CONFIG_64BIT
+       /* on 32-bit arch, it's impossible to overflow record's hdr->pg_off */
+       if (data_sz > RINGBUF_MAX_DATA_SZ)
+               return ERR_PTR(-E2BIG);
+#endif
+
+       rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
+       if (!rb)
+               return ERR_PTR(-ENOMEM);
+
+       spin_lock_init(&rb->spinlock);
+       init_waitqueue_head(&rb->waitq);
+       init_irq_work(&rb->work, bpf_ringbuf_notify);
+
+       rb->mask = data_sz - 1;
+       rb->consumer_pos = 0;
+       rb->producer_pos = 0;
+
+       return rb;
+}
+
+static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
+{
+       struct bpf_ringbuf_map *rb_map;
+       u64 cost;
+       int err;
+
+       if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
+               return ERR_PTR(-EINVAL);
+
+       if (attr->key_size || attr->value_size ||
+           attr->max_entries == 0 || !PAGE_ALIGNED(attr->max_entries))
+               return ERR_PTR(-EINVAL);
+
+       rb_map = kzalloc(sizeof(*rb_map), GFP_USER);
+       if (!rb_map)
+               return ERR_PTR(-ENOMEM);
+
+       bpf_map_init_from_attr(&rb_map->map, attr);
+
+       cost = sizeof(struct bpf_ringbuf_map) +
+              sizeof(struct bpf_ringbuf) +
+              attr->max_entries;
+       err = bpf_map_charge_init(&rb_map->map.memory, cost);
+       if (err)
+               goto err_free_map;
+
+       rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
+       if (IS_ERR(rb_map->rb)) {
+               err = PTR_ERR(rb_map->rb);
+               goto err_uncharge;
+       }
+
+       return &rb_map->map;
+
+err_uncharge:
+       bpf_map_charge_finish(&rb_map->map.memory);
+err_free_map:
+       kfree(rb_map);
+       return ERR_PTR(err);
+}
+
+static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
+{
+       /* copy pages pointer and nr_pages to local variable, as we are going
+        * to unmap rb itself with vunmap() below
+        */
+       struct page **pages = rb->pages;
+       int i, nr_pages = rb->nr_pages;
+
+       vunmap(rb);
+       for (i = 0; i < nr_pages; i++)
+               __free_page(pages[i]);
+       kvfree(pages);
+}
+
+static void ringbuf_map_free(struct bpf_map *map)
+{
+       struct bpf_ringbuf_map *rb_map;
+
+       /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+        * so the programs (can be more than one that used this map) were
+        * disconnected from events. Wait for outstanding critical sections in
+        * these programs to complete
+        */
+       synchronize_rcu();
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+       bpf_ringbuf_free(rb_map->rb);
+       kfree(rb_map);
+}
+
+static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
+{
+       return ERR_PTR(-ENOTSUPP);
+}
+
+static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
+                                  u64 flags)
+{
+       return -ENOTSUPP;
+}
+
+static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
+{
+       return -ENOTSUPP;
+}
+
+static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
+                                   void *next_key)
+{
+       return -ENOTSUPP;
+}
+
+static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb)
+{
+       size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT;
+
+       /* consumer page + producer page + 2 x data pages */
+       return RINGBUF_POS_PAGES + 2 * data_pages;
+}
+
+static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma)
+{
+       struct bpf_ringbuf_map *rb_map;
+       size_t mmap_sz;
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+       mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT;
+
+       if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz)
+               return -EINVAL;
+
+       return remap_vmalloc_range(vma, rb_map->rb,
+                                  vma->vm_pgoff + RINGBUF_PGOFF);
+}
+
+static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
+{
+       unsigned long cons_pos, prod_pos;
+
+       cons_pos = smp_load_acquire(&rb->consumer_pos);
+       prod_pos = smp_load_acquire(&rb->producer_pos);
+       return prod_pos - cons_pos;
+}
+
+static __poll_t ringbuf_map_poll(struct bpf_map *map, struct file *filp,
+                                struct poll_table_struct *pts)
+{
+       struct bpf_ringbuf_map *rb_map;
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+       poll_wait(filp, &rb_map->rb->waitq, pts);
+
+       if (ringbuf_avail_data_sz(rb_map->rb))
+               return EPOLLIN | EPOLLRDNORM;
+       return 0;
+}
+
+const struct bpf_map_ops ringbuf_map_ops = {
+       .map_alloc = ringbuf_map_alloc,
+       .map_free = ringbuf_map_free,
+       .map_mmap = ringbuf_map_mmap,
+       .map_poll = ringbuf_map_poll,
+       .map_lookup_elem = ringbuf_map_lookup_elem,
+       .map_update_elem = ringbuf_map_update_elem,
+       .map_delete_elem = ringbuf_map_delete_elem,
+       .map_get_next_key = ringbuf_map_get_next_key,
+};
+
+/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
+ * calculate offset from record metadata to ring buffer in pages, rounded
+ * down. This page offset is stored as part of record metadata and allows to
+ * restore struct bpf_ringbuf * from record pointer. This page offset is
+ * stored at offset 4 of record metadata header.
+ */
+static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
+                                    struct bpf_ringbuf_hdr *hdr)
+{
+       return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
+}
+
+/* Given pointer to ring buffer record header, restore pointer to struct
+ * bpf_ringbuf itself by using page offset stored at offset 4
+ */
+static struct bpf_ringbuf *
+bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
+{
+       unsigned long addr = (unsigned long)(void *)hdr;
+       unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;
+
+       return (void*)((addr & PAGE_MASK) - off);
+}
+
+static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
+{
+       unsigned long cons_pos, prod_pos, new_prod_pos, flags;
+       u32 len, pg_off;
+       struct bpf_ringbuf_hdr *hdr;
+
+       if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
+               return NULL;
+
+       len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
+       cons_pos = smp_load_acquire(&rb->consumer_pos);
+
+       if (in_nmi()) {
+               if (!spin_trylock_irqsave(&rb->spinlock, flags))
+                       return NULL;
+       } else {
+               spin_lock_irqsave(&rb->spinlock, flags);
+       }
+
+       prod_pos = rb->producer_pos;
+       new_prod_pos = prod_pos + len;
+
+       /* check for out of ringbuf space by ensuring producer position
+        * doesn't advance more than (ringbuf_size - 1) ahead
+        */
+       if (new_prod_pos - cons_pos > rb->mask) {
+               spin_unlock_irqrestore(&rb->spinlock, flags);
+               return NULL;
+       }
+
+       hdr = (void *)rb->data + (prod_pos & rb->mask);
+       pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
+       hdr->len = size | BPF_RINGBUF_BUSY_BIT;
+       hdr->pg_off = pg_off;
+
+       /* pairs with consumer's smp_load_acquire() */
+       smp_store_release(&rb->producer_pos, new_prod_pos);
+
+       spin_unlock_irqrestore(&rb->spinlock, flags);
+
+       return (void *)hdr + BPF_RINGBUF_HDR_SZ;
+}
+
+BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
+{
+       struct bpf_ringbuf_map *rb_map;
+
+       if (unlikely(flags))
+               return 0;
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+       return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
+}
+
+const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
+       .func           = bpf_ringbuf_reserve,
+       .ret_type       = RET_PTR_TO_ALLOC_MEM_OR_NULL,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_CONST_ALLOC_SIZE_OR_ZERO,
+       .arg3_type      = ARG_ANYTHING,
+};
+
+static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
+{
+       unsigned long rec_pos, cons_pos;
+       struct bpf_ringbuf_hdr *hdr;
+       struct bpf_ringbuf *rb;
+       u32 new_len;
+
+       hdr = sample - BPF_RINGBUF_HDR_SZ;
+       rb = bpf_ringbuf_restore_from_rec(hdr);
+       new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
+       if (discard)
+               new_len |= BPF_RINGBUF_DISCARD_BIT;
+
+       /* update record header with correct final size prefix */
+       xchg(&hdr->len, new_len);
+
+       /* if consumer caught up and is waiting for our record, notify about
+        * new data availability
+        */
+       rec_pos = (void *)hdr - (void *)rb->data;
+       cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;
+
+       if (flags & BPF_RB_FORCE_WAKEUP)
+               irq_work_queue(&rb->work);
+       else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
+               irq_work_queue(&rb->work);
+}
+
+BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
+{
+       bpf_ringbuf_commit(sample, flags, false /* discard */);
+       return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_submit_proto = {
+       .func           = bpf_ringbuf_submit,
+       .ret_type       = RET_VOID,
+       .arg1_type      = ARG_PTR_TO_ALLOC_MEM,
+       .arg2_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
+{
+       bpf_ringbuf_commit(sample, flags, true /* discard */);
+       return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_discard_proto = {
+       .func           = bpf_ringbuf_discard,
+       .ret_type       = RET_VOID,
+       .arg1_type      = ARG_PTR_TO_ALLOC_MEM,
+       .arg2_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
+          u64, flags)
+{
+       struct bpf_ringbuf_map *rb_map;
+       void *rec;
+
+       if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
+               return -EINVAL;
+
+       rb_map = container_of(map, struct bpf_ringbuf_map, map);
+       rec = __bpf_ringbuf_reserve(rb_map->rb, size);
+       if (!rec)
+               return -EAGAIN;
+
+       memcpy(rec, data, size);
+       bpf_ringbuf_commit(rec, flags, false /* discard */);
+       return 0;
+}
+
+const struct bpf_func_proto bpf_ringbuf_output_proto = {
+       .func           = bpf_ringbuf_output,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_PTR_TO_MEM,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+};
+
+BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
+{
+       struct bpf_ringbuf *rb;
+
+       rb = container_of(map, struct bpf_ringbuf_map, map)->rb;
+
+       switch (flags) {
+       case BPF_RB_AVAIL_DATA:
+               return ringbuf_avail_data_sz(rb);
+       case BPF_RB_RING_SIZE:
+               return rb->mask + 1;
+       case BPF_RB_CONS_POS:
+               return smp_load_acquire(&rb->consumer_pos);
+       case BPF_RB_PROD_POS:
+               return smp_load_acquire(&rb->producer_pos);
+       default:
+               return 0;
+       }
+}
+
+const struct bpf_func_proto bpf_ringbuf_query_proto = {
+       .func           = bpf_ringbuf_query,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+       .arg2_type      = ARG_ANYTHING,
+};
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 2c969a9b90d3b8f7d86c09e1be86e8f926719485..9de3540fa90c4e181212482ec1e9709a66bdc438 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -26,6 +26,7 @@
  #include <linux/audit.h>
  #include <uapi/linux/btf.h>
  #include <linux/bpf_lsm.h>
+#include <linux/poll.h>
  
  #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
                           (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
@@ -662,6 +663,16 @@ out:
         return err;
  }
  
+static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
+{
+       struct bpf_map *map = filp->private_data;
+
+       if (map->ops->map_poll)
+               return map->ops->map_poll(map, filp, pts);
+
+       return EPOLLERR;
+}
+
  const struct file_operations bpf_map_fops = {
  #ifdef CONFIG_PROC_FS
         .show_fdinfo    = bpf_map_show_fdinfo,
@@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = {
         .read           = bpf_dummy_read,
         .write          = bpf_dummy_write,
         .mmap           = bpf_map_mmap,
+       .poll           = bpf_map_poll,
  };
  
  int bpf_map_new_fd(struct bpf_map *map, int flags)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c

index 6d725a26f66e62ff619d296ab0ea498db829e983..5c7bbaac81ef951c840fec66e69ef53d11b8ad31 100644 (file)
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -233,6 +233,7 @@ struct bpf_call_arg_meta {
         bool pkt_access;
         int regno;
         int access_size;
+       int mem_size;
         u64 msize_max_value;
         int ref_obj_id;
         int func_id;
@@ -408,7 +409,8 @@ static bool reg_type_may_be_null(enum bpf_reg_type type)
                type == PTR_TO_SOCKET_OR_NULL ||
                type == PTR_TO_SOCK_COMMON_OR_NULL ||
                type == PTR_TO_TCP_SOCK_OR_NULL ||
-              type == PTR_TO_BTF_ID_OR_NULL;
+              type == PTR_TO_BTF_ID_OR_NULL ||
+              type == PTR_TO_MEM_OR_NULL;
  }
  
  static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
@@ -422,7 +424,9 @@ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
         return type == PTR_TO_SOCKET ||
                 type == PTR_TO_SOCKET_OR_NULL ||
                 type == PTR_TO_TCP_SOCK ||
-               type == PTR_TO_TCP_SOCK_OR_NULL;
+               type == PTR_TO_TCP_SOCK_OR_NULL ||
+               type == PTR_TO_MEM ||
+               type == PTR_TO_MEM_OR_NULL;
  }
  
  static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
@@ -436,7 +440,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
   */
  static bool is_release_function(enum bpf_func_id func_id)
  {
-       return func_id == BPF_FUNC_sk_release;
+       return func_id == BPF_FUNC_sk_release ||
+              func_id == BPF_FUNC_ringbuf_submit ||
+              func_id == BPF_FUNC_ringbuf_discard;
  }
  
  static bool may_be_acquire_function(enum bpf_func_id func_id)
@@ -444,7 +450,8 @@ static bool may_be_acquire_function(enum bpf_func_id func_id)
         return func_id == BPF_FUNC_sk_lookup_tcp ||
                 func_id == BPF_FUNC_sk_lookup_udp ||
                 func_id == BPF_FUNC_skc_lookup_tcp ||
-               func_id == BPF_FUNC_map_lookup_elem;
+               func_id == BPF_FUNC_map_lookup_elem ||
+               func_id == BPF_FUNC_ringbuf_reserve;
  }
  
  static bool is_acquire_function(enum bpf_func_id func_id,
@@ -454,7 +461,8 @@ static bool is_acquire_function(enum bpf_func_id func_id,
  
         if (func_id == BPF_FUNC_sk_lookup_tcp ||
             func_id == BPF_FUNC_sk_lookup_udp ||
-           func_id == BPF_FUNC_skc_lookup_tcp)
+           func_id == BPF_FUNC_skc_lookup_tcp ||
+           func_id == BPF_FUNC_ringbuf_reserve)
                 return true;
  
         if (func_id == BPF_FUNC_map_lookup_elem &&
@@ -494,6 +502,8 @@ static const char * const reg_type_str[] = {
         [PTR_TO_XDP_SOCK]       = "xdp_sock",
         [PTR_TO_BTF_ID]         = "ptr_",
         [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+       [PTR_TO_MEM]            = "mem",
+       [PTR_TO_MEM_OR_NULL]    = "mem_or_null",
  };
  
  static char slot_type_char[] = {
@@ -2468,32 +2478,49 @@ static int check_map_access_type(struct bpf_verifier_env *env, u32 regno,
         return 0;
  }
  
-/* check read/write into map element returned by bpf_map_lookup_elem() */
-static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
-                             int size, bool zero_size_allowed)
+/* check read/write into memory region (e.g., map value, ringbuf sample, etc) */
+static int __check_mem_access(struct bpf_verifier_env *env, int regno,
+                             int off, int size, u32 mem_size,
+                             bool zero_size_allowed)
  {
-       struct bpf_reg_state *regs = cur_regs(env);
-       struct bpf_map *map = regs[regno].map_ptr;
+       bool size_ok = size > 0 || (size == 0 && zero_size_allowed);
+       struct bpf_reg_state *reg;
+
+       if (off >= 0 && size_ok && (u64)off + size <= mem_size)
+               return 0;
  
-       if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
-           off + size > map->value_size) {
+       reg = &cur_regs(env)[regno];
+       switch (reg->type) {
+       case PTR_TO_MAP_VALUE:
                 verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
-                       map->value_size, off, size);
-               return -EACCES;
+                       mem_size, off, size);
+               break;
+       case PTR_TO_PACKET:
+       case PTR_TO_PACKET_META:
+       case PTR_TO_PACKET_END:
+               verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+                       off, size, regno, reg->id, off, mem_size);
+               break;
+       case PTR_TO_MEM:
+       default:
+               verbose(env, "invalid access to memory, mem_size=%u off=%d size=%d\n",
+                       mem_size, off, size);
         }
-       return 0;
+
+       return -EACCES;
  }
  
-/* check read/write into a map element with possible variable offset */
-static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-                           int off, int size, bool zero_size_allowed)
+/* check read/write into a memory region with possible variable offset */
+static int check_mem_region_access(struct bpf_verifier_env *env, u32 regno,
+                                  int off, int size, u32 mem_size,
+                                  bool zero_size_allowed)
  {
         struct bpf_verifier_state *vstate = env->cur_state;
         struct bpf_func_state *state = vstate->frame[vstate->curframe];
         struct bpf_reg_state *reg = &state->regs[regno];
         int err;
  
-       /* We may have adjusted the register to this map value, so we
+       /* We may have adjusted the register pointing to memory region, so we
          * need to try adding each of min_value and max_value to off
          * to make sure our theoretical access will be safe.
          */
@@ -2514,10 +2541,10 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
                         regno);
                 return -EACCES;
         }
-       err = __check_map_access(env, regno, reg->smin_value + off, size,
-                                zero_size_allowed);
+       err = __check_mem_access(env, regno, reg->smin_value + off, size,
+                                mem_size, zero_size_allowed);
         if (err) {
-               verbose(env, "R%d min value is outside of the array range\n",
+               verbose(env, "R%d min value is outside of the allowed memory range\n",
                         regno);
                 return err;
         }
@@ -2527,18 +2554,38 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
          * If reg->umax_value + off could overflow, treat that as unbounded too.
          */
         if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-               verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+               verbose(env, "R%d unbounded memory access, make sure to bounds check any such access\n",
                         regno);
                 return -EACCES;
         }
-       err = __check_map_access(env, regno, reg->umax_value + off, size,
-                                zero_size_allowed);
-       if (err)
-               verbose(env, "R%d max value is outside of the array range\n",
+       err = __check_mem_access(env, regno, reg->umax_value + off, size,
+                                mem_size, zero_size_allowed);
+       if (err) {
+               verbose(env, "R%d max value is outside of the allowed memory range\n",
                         regno);
+               return err;
+       }
+
+       return 0;
+}
  
-       if (map_value_has_spin_lock(reg->map_ptr)) {
-               u32 lock = reg->map_ptr->spin_lock_off;
+/* check read/write into a map element with possible variable offset */
+static int check_map_access(struct bpf_verifier_env *env, u32 regno,
+                           int off, int size, bool zero_size_allowed)
+{
+       struct bpf_verifier_state *vstate = env->cur_state;
+       struct bpf_func_state *state = vstate->frame[vstate->curframe];
+       struct bpf_reg_state *reg = &state->regs[regno];
+       struct bpf_map *map = reg->map_ptr;
+       int err;
+
+       err = check_mem_region_access(env, regno, off, size, map->value_size,
+                                     zero_size_allowed);
+       if (err)
+               return err;
+
+       if (map_value_has_spin_lock(map)) {
+               u32 lock = map->spin_lock_off;
  
                 /* if any part of struct bpf_spin_lock can be touched by
                  * load/store reject this program.
@@ -2596,21 +2643,6 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
         }
  }
  
-static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
-                                int off, int size, bool zero_size_allowed)
-{
-       struct bpf_reg_state *regs = cur_regs(env);
-       struct bpf_reg_state *reg = &regs[regno];
-
-       if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
-           (u64)off + size > reg->range) {
-               verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
-                       off, size, regno, reg->id, reg->off, reg->range);
-               return -EACCES;
-       }
-       return 0;
-}
-
  static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
                                int size, bool zero_size_allowed)
  {
@@ -2631,16 +2663,17 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
                         regno);
                 return -EACCES;
         }
-       err = __check_packet_access(env, regno, off, size, zero_size_allowed);
+       err = __check_mem_access(env, regno, off, size, reg->range,
+                                zero_size_allowed);
         if (err) {
                 verbose(env, "R%d offset is outside of the packet\n", regno);
                 return err;
         }
  
-       /* __check_packet_access has made sure "off + size - 1" is within u16.
+       /* __check_mem_access has made sure "off + size - 1" is within u16.
          * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff,
          * otherwise find_good_pkt_pointers would have refused to set range info
-        * that __check_packet_access would have rejected this pkt access.
+        * that __check_mem_access would have rejected this pkt access.
          * Therefore, "off + reg->umax_value + size - 1" won't overflow u32.
          */
         env->prog->aux->max_pkt_offset =
@@ -3220,6 +3253,16 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
                                 mark_reg_unknown(env, regs, value_regno);
                         }
                 }
+       } else if (reg->type == PTR_TO_MEM) {
+               if (t == BPF_WRITE && value_regno >= 0 &&
+                   is_pointer_value(env, value_regno)) {
+                       verbose(env, "R%d leaks addr into mem\n", value_regno);
+                       return -EACCES;
+               }
+               err = check_mem_region_access(env, regno, off, size,
+                                             reg->mem_size, false);
+               if (!err && t == BPF_READ && value_regno >= 0)
+                       mark_reg_unknown(env, regs, value_regno);
         } else if (reg->type == PTR_TO_CTX) {
                 enum bpf_reg_type reg_type = SCALAR_VALUE;
                 u32 btf_id = 0;
@@ -3557,6 +3600,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
                         return -EACCES;
                 return check_map_access(env, regno, reg->off, access_size,
                                         zero_size_allowed);
+       case PTR_TO_MEM:
+               return check_mem_region_access(env, regno, reg->off,
+                                              access_size, reg->mem_size,
+                                              zero_size_allowed);
         default: /* scalar_value|ptr_to_stack or invalid ptr */
                 return check_stack_boundary(env, regno, access_size,
                                             zero_size_allowed, meta);
@@ -3661,6 +3708,17 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
                type == ARG_CONST_SIZE_OR_ZERO;
  }
  
+static bool arg_type_is_alloc_mem_ptr(enum bpf_arg_type type)
+{
+       return type == ARG_PTR_TO_ALLOC_MEM ||
+              type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
+}
+
+static bool arg_type_is_alloc_size(enum bpf_arg_type type)
+{
+       return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
+}
+
  static bool arg_type_is_int_ptr(enum bpf_arg_type type)
  {
         return type == ARG_PTR_TO_INT ||
@@ -3720,7 +3778,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                          type != expected_type)
                         goto err_type;
         } else if (arg_type == ARG_CONST_SIZE ||
-                  arg_type == ARG_CONST_SIZE_OR_ZERO) {
+                  arg_type == ARG_CONST_SIZE_OR_ZERO ||
+                  arg_type == ARG_CONST_ALLOC_SIZE_OR_ZERO) {
                 expected_type = SCALAR_VALUE;
                 if (type != expected_type)
                         goto err_type;
@@ -3791,13 +3850,29 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                  * happens during stack boundary checking.
                  */
                 if (register_is_null(reg) &&
-                   arg_type == ARG_PTR_TO_MEM_OR_NULL)
+                   (arg_type == ARG_PTR_TO_MEM_OR_NULL ||
+                    arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL))
                         /* final test in check_stack_boundary() */;
                 else if (!type_is_pkt_pointer(type) &&
                          type != PTR_TO_MAP_VALUE &&
+                        type != PTR_TO_MEM &&
                          type != expected_type)
                         goto err_type;
                 meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
+       } else if (arg_type_is_alloc_mem_ptr(arg_type)) {
+               expected_type = PTR_TO_MEM;
+               if (register_is_null(reg) &&
+                   arg_type == ARG_PTR_TO_ALLOC_MEM_OR_NULL)
+                       /* final test in check_stack_boundary() */;
+               else if (type != expected_type)
+                       goto err_type;
+               if (meta->ref_obj_id) {
+                       verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n",
+                               regno, reg->ref_obj_id,
+                               meta->ref_obj_id);
+                       return -EFAULT;
+               }
+               meta->ref_obj_id = reg->ref_obj_id;
         } else if (arg_type_is_int_ptr(arg_type)) {
                 expected_type = PTR_TO_STACK;
                 if (!type_is_pkt_pointer(type) &&
@@ -3893,6 +3968,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
                                               zero_size_allowed, meta);
                 if (!err)
                         err = mark_chain_precision(env, regno);
+       } else if (arg_type_is_alloc_size(arg_type)) {
+               if (!tnum_is_const(reg->var_off)) {
+                       verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n",
+                               regno);
+                       return -EACCES;
+               }
+               meta->mem_size = reg->var_off.value;
         } else if (arg_type_is_int_ptr(arg_type)) {
                 int size = int_ptr_type_to_size(arg_type);
  
@@ -3929,6 +4011,14 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
                     func_id != BPF_FUNC_xdp_output)
                         goto error;
                 break;
+       case BPF_MAP_TYPE_RINGBUF:
+               if (func_id != BPF_FUNC_ringbuf_output &&
+                   func_id != BPF_FUNC_ringbuf_reserve &&
+                   func_id != BPF_FUNC_ringbuf_submit &&
+                   func_id != BPF_FUNC_ringbuf_discard &&
+                   func_id != BPF_FUNC_ringbuf_query)
+                       goto error;
+               break;
         case BPF_MAP_TYPE_STACK_TRACE:
                 if (func_id != BPF_FUNC_get_stackid)
                         goto error;
@@ -4655,6 +4745,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
                 mark_reg_known_zero(env, regs, BPF_REG_0);
                 regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
                 regs[BPF_REG_0].id = ++env->id_gen;
+       } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+               regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
+               regs[BPF_REG_0].id = ++env->id_gen;
+               regs[BPF_REG_0].mem_size = meta.mem_size;
         } else {
                 verbose(env, "unknown return type %d of func %s#%d\n",
                         fn->ret_type, func_id_name(func_id), func_id);
@@ -6611,6 +6706,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
                         reg->type = PTR_TO_TCP_SOCK;
                 } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) {
                         reg->type = PTR_TO_BTF_ID;
+               } else if (reg->type == PTR_TO_MEM_OR_NULL) {
+                       reg->type = PTR_TO_MEM;
                 }
                 if (is_null) {
                         /* We don't need id and ref_obj_id from this point
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c

index 187cd6995bbb970948dd3e2b0cfce88d3d69abf2..3767d34114c0ac8d8ce082d488a9113ae3276837 100644 (file)
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
                 return &bpf_perf_event_read_value_proto;
         case BPF_FUNC_get_ns_current_pid_tgid:
                 return &bpf_get_ns_current_pid_tgid_proto;
+       case BPF_FUNC_ringbuf_output:
+               return &bpf_ringbuf_output_proto;
+       case BPF_FUNC_ringbuf_reserve:
+               return &bpf_ringbuf_reserve_proto;
+       case BPF_FUNC_ringbuf_submit:
+               return &bpf_ringbuf_submit_proto;
+       case BPF_FUNC_ringbuf_discard:
+               return &bpf_ringbuf_discard_proto;
+       case BPF_FUNC_ringbuf_query:
+               return &bpf_ringbuf_query_proto;
         default:
                 return NULL;
         }
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index 54b93f8b49b83801ffe2177a5bde3bb4c07b6f75..974ca6e948e38e3689b827820ab1ce0eeec0990b 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -147,6 +147,7 @@ enum bpf_map_type {
         BPF_MAP_TYPE_SK_STORAGE,
         BPF_MAP_TYPE_DEVMAP_HASH,
         BPF_MAP_TYPE_STRUCT_OPS,
+       BPF_MAP_TYPE_RINGBUF,
  };
  
  /* Note that tracing related programs such as
@@ -3157,6 +3158,59 @@ union bpf_attr {
   *             **bpf_sk_cgroup_id**\ ().
   *     Return
   *             The id is returned or 0 in case the id could not be retrieved.
+ *
+ * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
+ *     Description
+ *             Copy *size* bytes from *data* into a ring buffer *ringbuf*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             0, on success;
+ *             < 0, on error.
+ *
+ * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
+ *     Description
+ *             Reserve *size* bytes of payload in a ring buffer *ringbuf*.
+ *     Return
+ *             Valid pointer with *size* bytes of memory available; NULL,
+ *             otherwise.
+ *
+ * void bpf_ringbuf_submit(void *data, u64 flags)
+ *     Description
+ *             Submit reserved ring buffer sample, pointed to by *data*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             Nothing. Always succeeds.
+ *
+ * void bpf_ringbuf_discard(void *data, u64 flags)
+ *     Description
+ *             Discard reserved ring buffer sample, pointed to by *data*.
+ *             If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
+ *             new data availability is sent.
+ *             If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
+ *             new data availability is sent unconditionally.
+ *     Return
+ *             Nothing. Always succeeds.
+ *
+ * u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
+ *     Description
+ *             Query various characteristics of provided ring buffer. What
+ *             exactly is queried is determined by *flags*:
+ *               - BPF_RB_AVAIL_DATA - amount of data not yet consumed;
+ *               - BPF_RB_RING_SIZE - the size of ring buffer;
+ *               - BPF_RB_CONS_POS - consumer position (can wrap around);
+ *               - BPF_RB_PROD_POS - producer(s) position (can wrap around);
+ *             Data returned is just a momentary snapshot of actual values
+ *             and could be inaccurate, so this facility should be used to
+ *             power heuristics and for reporting, not to make 100% correct
+ *             calculation.
+ *     Return
+ *             Requested value, or 0, if flags are not recognized.
   */
  #define __BPF_FUNC_MAPPER(FN)          \
         FN(unspec),                     \
@@ -3288,7 +3342,12 @@ union bpf_attr {
         FN(seq_printf),                 \
         FN(seq_write),                  \
         FN(sk_cgroup_id),               \
-       FN(sk_ancestor_cgroup_id),
+       FN(sk_ancestor_cgroup_id),      \
+       FN(ringbuf_output),             \
+       FN(ringbuf_reserve),            \
+       FN(ringbuf_submit),             \
+       FN(ringbuf_discard),            \
+       FN(ringbuf_query),
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
   * function eBPF program intends to call
@@ -3398,6 +3457,29 @@ enum {
         BPF_F_GET_BRANCH_RECORDS_SIZE   = (1ULL << 0),
  };
  
+/* BPF_FUNC_ringbuf_submit, BPF_FUNC_ringbuf_discard, and
+ * BPF_FUNC_ringbuf_output flags.
+ */
+enum {
+       BPF_RB_NO_WAKEUP                = (1ULL << 0),
+       BPF_RB_FORCE_WAKEUP             = (1ULL << 1),
+};
+
+/* BPF_FUNC_ringbuf_query flags */
+enum {
+       BPF_RB_AVAIL_DATA = 0,
+       BPF_RB_RING_SIZE = 1,
+       BPF_RB_CONS_POS = 2,
+       BPF_RB_PROD_POS = 3,
+};
+
+/* BPF ring buffer constants */
+enum {
+       BPF_RINGBUF_BUSY_BIT            = (1U << 31),
+       BPF_RINGBUF_DISCARD_BIT         = (1U << 30),
+       BPF_RINGBUF_HDR_SZ              = 8,
+};
+
  /* Mode for BPF_FUNC_skb_adjust_room helper. */
  enum bpf_adj_room_mode {
         BPF_ADJ_ROOM_NET,
diff --git a/tools/testing/selftests/bpf/verifier/and.c b/tools/testing/selftests/bpf/verifier/and.c

index e0fad1548737212824cf683ce06a0073c51d02df..d781bc86e1000f2a03b6bad62bc956b575df0a90 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/and.c
+++ b/tools/testing/selftests/bpf/verifier/and.c
@@ -15,7 +15,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R0 max value is outside of the array range",
+       .errstr = "R0 max value is outside of the allowed memory range",
         .result = REJECT,
         .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },
@@ -44,7 +44,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R0 max value is outside of the array range",
+       .errstr = "R0 max value is outside of the allowed memory range",
         .result = REJECT,
         .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },
diff --git a/tools/testing/selftests/bpf/verifier/array_access.c b/tools/testing/selftests/bpf/verifier/array_access.c

index f3c33e128709b885ee579f08f02ea09cea7bf211..1c4b1939f5a8d8e0e83964c45fd1f47044ff3dcf 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/array_access.c
+++ b/tools/testing/selftests/bpf/verifier/array_access.c
@@ -117,7 +117,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R0 min value is outside of the array range",
+       .errstr = "R0 min value is outside of the allowed memory range",
         .result = REJECT,
         .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },
@@ -137,7 +137,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map",
+       .errstr = "R0 unbounded memory access, make sure to bounds check any such access",
         .result = REJECT,
         .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },
diff --git a/tools/testing/selftests/bpf/verifier/bounds.c b/tools/testing/selftests/bpf/verifier/bounds.c

index 58f4aa593b1b527f49f3b042c16920cf4e29a082..4d6645f2874c7c505fb2a3dfe0b1c1ec196a480b 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/bounds.c
+++ b/tools/testing/selftests/bpf/verifier/bounds.c
@@ -20,7 +20,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_8b = { 3 },
-       .errstr = "R0 max value is outside of the array range",
+       .errstr = "R0 max value is outside of the allowed memory range",
         .result = REJECT,
  },
  {
@@ -146,7 +146,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_8b = { 3 },
-       .errstr = "R0 min value is outside of the array range",
+       .errstr = "R0 min value is outside of the allowed memory range",
         .result = REJECT
  },
  {
@@ -354,7 +354,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_8b = { 3 },
-       .errstr = "R0 max value is outside of the array range",
+       .errstr = "R0 max value is outside of the allowed memory range",
         .result = REJECT
  },
  {
diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c

index 7629a0cebb9b61ab3e73ac0892b75a7c5b3a7444..94258c6b5235b702d4d93ce0780e38018f16bdd9 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/calls.c
+++ b/tools/testing/selftests/bpf/verifier/calls.c
@@ -105,7 +105,7 @@
         .prog_type = BPF_PROG_TYPE_SCHED_CLS,
         .fixup_map_hash_8b = { 16 },
         .result = REJECT,
-       .errstr = "R0 min value is outside of the array range",
+       .errstr = "R0 min value is outside of the allowed memory range",
  },
  {
         "calls: overlapping caller/callee",
diff --git a/tools/testing/selftests/bpf/verifier/direct_value_access.c b/tools/testing/selftests/bpf/verifier/direct_value_access.c

index b9fb28e8e2243da75d2ef98141fcf216517761eb..988f46a1a4c7307bc9c42d96915398e4e34f0a1a 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/direct_value_access.c
+++ b/tools/testing/selftests/bpf/verifier/direct_value_access.c
@@ -68,7 +68,7 @@
         },
         .fixup_map_array_48b = { 1 },
         .result = REJECT,
-       .errstr = "R1 min value is outside of the array range",
+       .errstr = "R1 min value is outside of the allowed memory range",
  },
  {
         "direct map access, write test 7",
@@ -220,7 +220,7 @@
         },
         .fixup_map_array_small = { 1 },
         .result = REJECT,
-       .errstr = "R1 min value is outside of the array range",
+       .errstr = "R1 min value is outside of the allowed memory range",
  },
  {
         "direct map access, write test 19",
diff --git a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c

index 67ab124100503b2d1fbd0f38187f8cb19862c5ee..5a605ae131a95471bcb9aa8ca4cf8d54da4cd5b0 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/helper_access_var_len.c
+++ b/tools/testing/selftests/bpf/verifier/helper_access_var_len.c
@@ -318,7 +318,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 4 },
-       .errstr = "R1 min value is outside of the array range",
+       .errstr = "R1 min value is outside of the allowed memory range",
         .result = REJECT,
         .prog_type = BPF_PROG_TYPE_TRACEPOINT,
  },
diff --git a/tools/testing/selftests/bpf/verifier/helper_value_access.c b/tools/testing/selftests/bpf/verifier/helper_value_access.c

index 7572e403ddb95693c2afe81bc8cfb978557a663f..961f28139b9624f388d386c3449deec622a0a5e3 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/helper_value_access.c
+++ b/tools/testing/selftests/bpf/verifier/helper_value_access.c
@@ -280,7 +280,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R1 min value is outside of the array range",
+       .errstr = "R1 min value is outside of the allowed memory range",
         .result = REJECT,
         .prog_type = BPF_PROG_TYPE_TRACEPOINT,
  },
@@ -415,7 +415,7 @@
         BPF_EXIT_INSN(),
         },
         .fixup_map_hash_48b = { 3 },
-       .errstr = "R1 min value is outside of the array range",
+       .errstr = "R1 min value is outside of the allowed memory range",
         .result = REJECT,
         .prog_type = BPF_PROG_TYPE_TRACEPOINT,
  },
@@ -926,7 +926,7 @@
         },
         .fixup_map_hash_16b = { 3, 10 },
         .result = REJECT,
-       .errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map",
+       .errstr = "R2 unbounded memory access, make sure to bounds check any such access",
         .prog_type = BPF_PROG_TYPE_TRACEPOINT,
  },
  {
diff --git a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c

index a53d99cebd9ffff7db1c1065b74135a786b9f280..97ee658e1242461eb8ad99c4b5523b0196482186 100644 (file)
--- a/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
+++ b/tools/testing/selftests/bpf/verifier/value_ptr_arith.c
@@ -50,7 +50,7 @@
         .fixup_map_array_48b = { 8 },
         .result = ACCEPT,
         .result_unpriv = REJECT,
-       .errstr_unpriv = "R0 min value is outside of the array range",
+       .errstr_unpriv = "R0 min value is outside of the allowed memory range",
         .retval = 1,
  },
  {
@@ -325,7 +325,7 @@
         },
         .fixup_map_array_48b = { 3 },
         .result = REJECT,
-       .errstr = "R0 min value is outside of the array range",
+       .errstr = "R0 min value is outside of the allowed memory range",
         .result_unpriv = REJECT,
         .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
  },
@@ -601,7 +601,7 @@
         },
         .fixup_map_array_48b = { 3 },
         .result = REJECT,
-       .errstr = "R1 max value is outside of the array range",
+       .errstr = "R1 max value is outside of the allowed memory range",
         .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range",
         .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
  },
@@ -726,7 +726,7 @@
         },
         .fixup_map_array_48b = { 3 },
         .result = REJECT,
-       .errstr = "R0 min value is outside of the array range",
+       .errstr = "R0 min value is outside of the allowed memory range",
  },
  {
         "map access: value_ptr -= known scalar, 2",
author	Andrii Nakryiko <andriin@fb.com>
	Fri, 29 May 2020 07:54:20 +0000 (00:54 -0700)
committer	Alexei Starovoitov <ast@kernel.org>
	Mon, 1 Jun 2020 21:38:22 +0000 (14:38 -0700)
include/linux/bpf.h		patch \| blob \| history
include/linux/bpf_types.h		patch \| blob \| history
include/linux/bpf_verifier.h		patch \| blob \| history
include/uapi/linux/bpf.h		patch \| blob \| history
kernel/bpf/Makefile		patch \| blob \| history
kernel/bpf/helpers.c		patch \| blob \| history
kernel/bpf/ringbuf.c	[new file with mode: 0644]	patch \| blob
kernel/bpf/syscall.c		patch \| blob \| history
kernel/bpf/verifier.c		patch \| blob \| history
kernel/trace/bpf_trace.c		patch \| blob \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| history
tools/testing/selftests/bpf/verifier/and.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/array_access.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/bounds.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/calls.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/direct_value_access.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/helper_access_var_len.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/helper_value_access.c		patch \| blob \| history
tools/testing/selftests/bpf/verifier/value_ptr_arith.c		patch \| blob \| history