linux-hardened/net/core/sock_reuseport.c
Martin KaFai Lau 40a1227ea8 tcp: Avoid TCP syncookie rejected by SO_REUSEPORT socket
Although the actual cookie check "__cookie_v[46]_check()" does
not involve sk specific info, it checks whether the sk has recent
synq overflow event in "tcp_synq_no_recent_overflow()".  The
tcp_sk(sk)->rx_opt.ts_recent_stamp is updated every second
when it has sent out a syncookie (through "tcp_synq_overflow()").

The above per sk "recent synq overflow event timestamp" works well
for non SO_REUSEPORT use case.  However, it may cause random
connection request reject/discard when SO_REUSEPORT is used with
syncookie because it fails the "tcp_synq_no_recent_overflow()"
test.

When SO_REUSEPORT is used, it usually has multiple listening
socks serving TCP connection requests destinated to the same local IP:PORT.
There are cases that the TCP-ACK-COOKIE may not be received
by the same sk that sent out the syncookie.  For example,
if reuse->socks[] began with {sk0, sk1},
1) sk1 sent out syncookies and tcp_sk(sk1)->rx_opt.ts_recent_stamp
   was updated.
2) the reuse->socks[] became {sk1, sk2} later.  e.g. sk0 was first closed
   and then sk2 was added.  Here, sk2 does not have ts_recent_stamp set.
   There are other ordering that will trigger the similar situation
   below but the idea is the same.
3) When the TCP-ACK-COOKIE comes back, sk2 was selected.
   "tcp_synq_no_recent_overflow(sk2)" returns true. In this case,
   all syncookies sent by sk1 will be handled (and rejected)
   by sk2 while sk1 is still alive.

The userspace may create and remove listening SO_REUSEPORT sockets
as it sees fit.  E.g. Adding new thread (and SO_REUSEPORT sock) to handle
incoming requests, old process stopping and new process starting...etc.
With or without SO_ATTACH_REUSEPORT_[CB]BPF,
the sockets leaving and joining a reuseport group makes picking
the same sk to check the syncookie very difficult (if not impossible).

The later patches will allow bpf prog more flexibility in deciding
where a sk should be located in a bpf map and selecting a particular
SO_REUSEPORT sock as it sees fit.  e.g. Without closing any sock,
replace the whole bpf reuseport_array in one map_update() by using
map-in-map.  Getting the syncookie check working smoothly across
socks in the same "reuse->socks[]" is important.

A partial solution is to set the newly added sk's ts_recent_stamp
to the max ts_recent_stamp of a reuseport group but that will require
to iterate through reuse->socks[]  OR
pessimistically set it to "now - TCP_SYNCOOKIE_VALID" when a sk is
joining a reuseport group.  However, neither of them will solve the
existing sk getting moved around the reuse->socks[] and that
sk may not have ts_recent_stamp updated, unlikely under continuous
synflood but not impossible.

This patch opts to treat the reuseport group as a whole when
considering the last synq overflow timestamp since
they are serving the same IP:PORT from the userspace
(and BPF program) perspective.

"synq_overflow_ts" is added to "struct sock_reuseport".
The tcp_synq_overflow() and tcp_synq_no_recent_overflow()
will update/check reuse->synq_overflow_ts if the sk is
in a reuseport group.  Similar to the reuseport decision in
__inet_lookup_listener(), both sk->sk_reuseport and
sk->sk_reuseport_cb are tested for SO_REUSEPORT usage.
Update on "synq_overflow_ts" happens at roughly once
every second.

A synflood test was done with a 16 rx-queues and 16 reuseport sockets.
No meaningful performance change is observed.  Before and
after the change is ~9Mpps in IPv4.

Cc: Eric Dumazet <edumazet@google.com>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
2018-08-11 01:58:45 +02:00

272 lines
6.8 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* To speed up listener socket lookup, create an array to store all sockets
* listening on the same port. This allows a decision to be made after finding
* the first socket. An optional BPF program can also be configured for
* selecting the socket index from the array of available sockets.
*/
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
static DEFINE_SPINLOCK(reuseport_lock);
static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks)
{
unsigned int size = sizeof(struct sock_reuseport) +
sizeof(struct sock *) * max_socks;
struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
if (!reuse)
return NULL;
reuse->max_socks = max_socks;
RCU_INIT_POINTER(reuse->prog, NULL);
return reuse;
}
int reuseport_alloc(struct sock *sk)
{
struct sock_reuseport *reuse;
/* bh lock used since this function call may precede hlist lock in
* soft irq of receive path or setsockopt from process context
*/
spin_lock_bh(&reuseport_lock);
/* Allocation attempts can occur concurrently via the setsockopt path
* and the bind/hash path. Nothing to do when we lose the race.
*/
if (rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock)))
goto out;
reuse = __reuseport_alloc(INIT_SOCKS);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
return -ENOMEM;
}
reuse->socks[0] = sk;
reuse->num_socks = 1;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
out:
spin_unlock_bh(&reuseport_lock);
return 0;
}
EXPORT_SYMBOL(reuseport_alloc);
static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
{
struct sock_reuseport *more_reuse;
u32 more_socks_size, i;
more_socks_size = reuse->max_socks * 2U;
if (more_socks_size > U16_MAX)
return NULL;
more_reuse = __reuseport_alloc(more_socks_size);
if (!more_reuse)
return NULL;
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
more_reuse->synq_overflow_ts = READ_ONCE(reuse->synq_overflow_ts);
for (i = 0; i < reuse->num_socks; ++i)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
* that reuse and more_reuse can temporarily share a reference
* to prog.
*/
kfree_rcu(reuse, rcu);
return more_reuse;
}
static void reuseport_free_rcu(struct rcu_head *head)
{
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
if (reuse->prog)
bpf_prog_destroy(reuse->prog);
kfree(reuse);
}
/**
* reuseport_add_sock - Add a socket to the reuseport group of another.
* @sk: New socket to add to the group.
* @sk2: Socket belonging to the existing reuseport group.
* May return ENOMEM and not add socket to group under memory pressure.
*/
int reuseport_add_sock(struct sock *sk, struct sock *sk2)
{
struct sock_reuseport *old_reuse, *reuse;
if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
int err = reuseport_alloc(sk2);
if (err)
return err;
}
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
if (old_reuse && old_reuse->num_socks != 1) {
spin_unlock_bh(&reuseport_lock);
return -EBUSY;
}
if (reuse->num_socks == reuse->max_socks) {
reuse = reuseport_grow(reuse);
if (!reuse) {
spin_unlock_bh(&reuseport_lock);
return -ENOMEM;
}
}
reuse->socks[reuse->num_socks] = sk;
/* paired with smp_rmb() in reuseport_select_sock() */
smp_wmb();
reuse->num_socks++;
rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
spin_unlock_bh(&reuseport_lock);
if (old_reuse)
call_rcu(&old_reuse->rcu, reuseport_free_rcu);
return 0;
}
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
int i;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
for (i = 0; i < reuse->num_socks; i++) {
if (reuse->socks[i] == sk) {
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
if (reuse->num_socks == 0)
call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
spin_unlock_bh(&reuseport_lock);
}
EXPORT_SYMBOL(reuseport_detach_sock);
static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
if (skb_shared(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return NULL;
skb = nskb;
}
/* temporarily advance data past protocol header */
if (!pskb_pull(skb, hdr_len)) {
kfree_skb(nskb);
return NULL;
}
index = bpf_prog_run_save_cb(prog, skb);
__skb_push(skb, hdr_len);
consume_skb(nskb);
if (index >= socks)
return NULL;
return reuse->socks[index];
}
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
* @hash: When no BPF filter is available, use this hash to select.
* @skb: skb to run through BPF filter.
* @hdr_len: BPF filter expects skb data pointer at payload data. If
* the skb does not yet point at the payload, this parameter represents
* how far the pointer needs to advance to reach the payload.
* Returns a socket that should receive the packet (or NULL on error).
*/
struct sock *reuseport_select_sock(struct sock *sk,
u32 hash,
struct sk_buff *skb,
int hdr_len)
{
struct sock_reuseport *reuse;
struct bpf_prog *prog;
struct sock *sk2 = NULL;
u16 socks;
rcu_read_lock();
reuse = rcu_dereference(sk->sk_reuseport_cb);
/* if memory allocation failed or add call is not yet complete */
if (!reuse)
goto out;
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
if (prog && skb)
sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
/* no bpf or invalid bpf result: fall back to hash usage */
if (!sk2)
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
}
out:
rcu_read_unlock();
return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_prog = rcu_dereference_protected(reuse->prog,
lockdep_is_held(&reuseport_lock));
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
return old_prog;
}
EXPORT_SYMBOL(reuseport_attach_prog);