net: better IFF_XMIT_DST_RELEASE support

Testing xmit_more support with netperf and connected UDP sockets,
I found strange dst refcount false sharing.

Current handling of IFF_XMIT_DST_RELEASE is not optimal.

Dropping dst in validate_xmit_skb() is certainly too late in case
packet was queued by cpu X but dequeued by cpu Y

The logical point to take care of drop/force is in __dev_queue_xmit()
before even taking qdisc lock.

As Julian Anastasov pointed out, need for skb_dst() might come from some
packet schedulers or classifiers.

This patch adds new helper to cleanly express needs of various drivers
or qdiscs/classifiers.

Drivers that need skb_dst() in their ndo_start_xmit() should call
following helper in their setup instead of the prior :

	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
->
	netif_keep_dst(dev);

Instead of using a single bit, we use two bits, one being
eventually rebuilt in bonding/team drivers.

The other one, is permanent and blocks IFF_XMIT_DST_RELEASE being
rebuilt in bonding/team. Eventually, we could add something
smarter later.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2014-10-05 18:38:35 -07:00 committed by David S. Miller
parent fe971b95c2
commit 0287587884
27 changed files with 54 additions and 39 deletions

View file

@ -1364,7 +1364,7 @@ void ipoib_setup(struct net_device *dev)
dev->tx_queue_len = ipoib_sendq_size * 2;
dev->features = (NETIF_F_VLAN_CHALLENGED |
NETIF_F_HIGHDMA);
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);

View file

@ -74,7 +74,7 @@ static struct net_device * __init ipddp_init(void)
if (!dev)
return ERR_PTR(-ENOMEM);
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
strcpy(dev->name, "ipddp%d");
if (version_printed++ == 0)

View file

@ -1002,7 +1002,8 @@ static netdev_features_t bond_fix_features(struct net_device *dev,
static void bond_compute_features(struct bonding *bond)
{
unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
IFF_XMIT_DST_RELEASE_PERM;
netdev_features_t vlan_features = BOND_VLAN_FEATURES;
netdev_features_t enc_features = BOND_ENC_FEATURES;
struct net_device *bond_dev = bond->dev;
@ -1038,8 +1039,10 @@ done:
bond_dev->gso_max_segs = gso_max_segs;
netif_set_gso_max_size(bond_dev, gso_max_size);
flags = bond_dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
bond_dev->priv_flags = flags | dst_release_flag;
bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) &&
dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE;
netdev_change_features(bond_dev);
}

View file

@ -199,7 +199,7 @@ static void __init eql_setup(struct net_device *dev)
dev->type = ARPHRD_SLIP;
dev->tx_queue_len = 5; /* Hands them off fast */
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
static int eql_open(struct net_device *dev)

View file

@ -185,7 +185,8 @@ static void ifb_setup(struct net_device *dev)
dev->flags |= IFF_NOARP;
dev->flags &= ~IFF_MULTICAST;
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
eth_hw_addr_random(dev);
}

View file

@ -169,7 +169,7 @@ static void loopback_setup(struct net_device *dev)
dev->type = ARPHRD_LOOPBACK; /* 0x0001*/
dev->flags = IFF_LOOPBACK;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
dev->hw_features = NETIF_F_ALL_TSO | NETIF_F_UFO;
dev->features = NETIF_F_SG | NETIF_F_FRAGLIST
| NETIF_F_ALL_TSO

View file

@ -1025,7 +1025,8 @@ void macvlan_common_setup(struct net_device *dev)
{
ether_setup(dev);
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
dev->priv_flags |= IFF_UNICAST_FLT;
dev->netdev_ops = &macvlan_netdev_ops;
dev->destructor = free_netdev;

View file

@ -1103,7 +1103,7 @@ static void ppp_setup(struct net_device *dev)
dev->type = ARPHRD_PPP;
dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
dev->features |= NETIF_F_NETNS_LOCAL;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
/*

View file

@ -970,7 +970,8 @@ static void __team_compute_features(struct team *team)
struct team_port *port;
u32 vlan_features = TEAM_VLAN_FEATURES & NETIF_F_ALL_FOR_ALL;
unsigned short max_hard_header_len = ETH_HLEN;
unsigned int flags, dst_release_flag = IFF_XMIT_DST_RELEASE;
unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE |
IFF_XMIT_DST_RELEASE_PERM;
list_for_each_entry(port, &team->port_list, list) {
vlan_features = netdev_increment_features(vlan_features,
@ -985,8 +986,9 @@ static void __team_compute_features(struct team *team)
team->dev->vlan_features = vlan_features;
team->dev->hard_header_len = max_hard_header_len;
flags = team->dev->priv_flags & ~IFF_XMIT_DST_RELEASE;
team->dev->priv_flags = flags | dst_release_flag;
team->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
if (dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM))
team->dev->priv_flags |= IFF_XMIT_DST_RELEASE;
netdev_change_features(team->dev);
}

View file

@ -2193,7 +2193,7 @@ static void vxlan_setup(struct net_device *dev)
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
INIT_LIST_HEAD(&vxlan->next);

View file

@ -1047,7 +1047,7 @@ static void pvc_setup(struct net_device *dev)
dev->flags = IFF_POINTOPOINT;
dev->hard_header_len = 10;
dev->addr_len = 2;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
static const struct net_device_ops pvc_ops = {

View file

@ -3306,7 +3306,7 @@ static int qeth_l3_setup_netdev(struct qeth_card *card)
card->dev->features |= NETIF_F_HW_VLAN_CTAG_TX |
NETIF_F_HW_VLAN_CTAG_RX |
NETIF_F_HW_VLAN_CTAG_FILTER;
card->dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(card->dev);
card->dev->gso_max_size = 15 * PAGE_SIZE;
SET_NETDEV_DEV(card->dev, &card->gdev->dev);

View file

@ -1206,6 +1206,7 @@ enum netdev_priv_flags {
IFF_SUPP_NOFCS = 1<<19,
IFF_LIVE_ADDR_CHANGE = 1<<20,
IFF_MACVLAN = 1<<21,
IFF_XMIT_DST_RELEASE_PERM = 1<<22,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
@ -1230,6 +1231,7 @@ enum netdev_priv_flags {
#define IFF_SUPP_NOFCS IFF_SUPP_NOFCS
#define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE
#define IFF_MACVLAN IFF_MACVLAN
#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM
/**
* struct net_device - The DEVICE structure.
@ -3588,6 +3590,12 @@ static inline bool netif_supports_nofcs(struct net_device *dev)
return dev->priv_flags & IFF_SUPP_NOFCS;
}
/* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
static inline void netif_keep_dst(struct net_device *dev)
{
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM);
}
extern struct pernet_operations __net_initdata loopback_net_ops;
/* Logging, debugging and troubleshooting/diagnostic helpers. */

View file

@ -799,7 +799,8 @@ void vlan_setup(struct net_device *dev)
ether_setup(dev);
dev->priv_flags |= IFF_802_1Q_VLAN;
dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
dev->tx_queue_len = 0;
dev->netdev_ops = &vlan_netdev_ops;

View file

@ -501,7 +501,7 @@ static void clip_setup(struct net_device *dev)
/* without any more elaborate queuing. 100 is a reasonable */
/* compromise between decent burst-tolerance and protection */
/* against memory hogs. */
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
static int clip_create(int number)

View file

@ -2665,12 +2665,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
if (skb->next)
return skb;
/* If device doesn't need skb->dst, release it right now while
* its hot in this cpu cache
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@ -2811,8 +2805,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
* waiting to be sent out; and the qdisc is not running -
* xmit the skb directly.
*/
if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
skb_dst_force(skb);
qdisc_bstats_update(q, skb);
@ -2827,7 +2819,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
rc = NET_XMIT_SUCCESS;
} else {
skb_dst_force(skb);
rc = q->enqueue(skb, q) & NET_XMIT_MASK;
if (qdisc_run_begin(q)) {
if (unlikely(contended)) {
@ -2924,6 +2915,14 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_update_prio(skb);
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
skb_dst_drop(skb);
else
skb_dst_force(skb);
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
@ -6674,7 +6673,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->all_adj_list.upper);
INIT_LIST_HEAD(&dev->all_adj_list.lower);
dev->priv_flags = IFF_XMIT_DST_RELEASE;
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
dev->num_tx_queues = txqs;

View file

@ -510,7 +510,7 @@ static int ipgre_tunnel_init(struct net_device *dev)
memcpy(dev->broadcast, &iph->daddr, 4);
dev->flags = IFF_NOARP;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
dev->addr_len = 4;
if (iph->daddr) {

View file

@ -364,7 +364,7 @@ static int vti_tunnel_init(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
return ip_tunnel_init(dev);
}

View file

@ -289,7 +289,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
dev->features |= IPIP_FEATURES;
dev->hw_features |= IPIP_FEATURES;

View file

@ -1242,7 +1242,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
dev->flags |= IFF_NOARP;
dev->iflink = 0;
dev->addr_len = sizeof(struct in6_addr);
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
static int ip6gre_tunnel_init(struct net_device *dev)

View file

@ -1493,7 +1493,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev)
dev->mtu -= 8;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
/* This perm addr will be used as interface identifier by IPv6 */
dev->addr_assign_type = NET_ADDR_RANDOM;
eth_random_addr(dev->perm_addr);

View file

@ -807,7 +807,7 @@ static void vti6_dev_setup(struct net_device *dev)
dev->mtu = ETH_DATA_LEN;
dev->flags |= IFF_NOARP;
dev->addr_len = sizeof(struct in6_addr);
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
/**

View file

@ -1364,7 +1364,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
dev->hard_header_len = LL_MAX_HEADER + t_hlen;
dev->mtu = ETH_DATA_LEN - t_hlen;
dev->flags = IFF_NOARP;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
dev->iflink = 0;
dev->addr_len = 4;
dev->features |= NETIF_F_LLTX;

View file

@ -493,6 +493,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb,
tcf_exts_change(tp, &fnew->exts, &e);
tcf_em_tree_change(tp, &fnew->ematches, &t);
netif_keep_dst(qdisc_dev(tp->q));
if (tb[TCA_FLOW_KEYS]) {
fnew->keymask = keymask;
fnew->nkeys = nkeys;

View file

@ -524,6 +524,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb,
if (f->handle < f1->handle)
break;
netif_keep_dst(qdisc_dev(tp->q));
rcu_assign_pointer(f->next, f1);
rcu_assign_pointer(*fp, f);

View file

@ -47,7 +47,6 @@ EXPORT_SYMBOL(default_qdisc_ops);
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
skb_dst_force(skb);
q->gso_skb = skb;
q->qstats.requeues++;
q->q.qlen++; /* it's still part of the queue */
@ -218,8 +217,6 @@ static inline int qdisc_restart(struct Qdisc *q)
if (unlikely(!skb))
return 0;
WARN_ON_ONCE(skb_dst_is_noref(skb));
root_lock = qdisc_lock(q);
dev = qdisc_dev(q);
txq = skb_get_tx_queue(dev, skb);

View file

@ -470,7 +470,7 @@ static __init void teql_master_setup(struct net_device *dev)
dev->tx_queue_len = 100;
dev->flags = IFF_NOARP;
dev->hard_header_len = LL_MAX_HEADER;
dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
netif_keep_dst(dev);
}
static LIST_HEAD(master_dev_list);