Эх сурвалжийг харах

net: Allow accepted sockets to be bound to l3mdev domain

Allow accepted sockets to derive their sk_bound_dev_if setting from the
l3mdev domain in which the packets originated. A sysctl setting is added
to control the behavior, which is similar to sk_mark and
sysctl_tcp_fwmark_accept.

This effectively allows a process to have a "VRF-global" listen socket,
with child sockets bound to the VRF device in which the packet originated.
A similar behavior can be achieved using sk_mark, but a solution using marks
is incomplete as it does not handle duplicate addresses in different L3
domains/VRFs. Allowing sockets to inherit the sk_bound_dev_if from the l3mdev
domain provides a complete solution.

Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
David Ahern 9 жил өмнө
parent
commit
6dd9a14e92

+ 8 - 0
Documentation/networking/ip-sysctl.txt

@@ -335,6 +335,14 @@ tcp_keepalive_intvl - INTEGER
 	after probes started. Default value: 75sec i.e. connection
 	after probes started. Default value: 75sec i.e. connection
 	will be aborted after ~11 minutes of retries.
 	will be aborted after ~11 minutes of retries.
 
 
+tcp_l3mdev_accept - BOOLEAN
+	Enables child sockets to inherit the L3 master device index.
+	Enabling this option allows a "global" listen socket to work
+	across L3 master domains (e.g., VRFs) with connected sockets
+	derived from the listen socket to be bound to the L3 domain in
+	which the packets originated. Only valid when the kernel was
+	compiled with CONFIG_NET_L3_MASTER_DEV.
+
 tcp_low_latency - BOOLEAN
 tcp_low_latency - BOOLEAN
 	If set, the TCP stack makes decisions that prefer lower
 	If set, the TCP stack makes decisions that prefer lower
 	latency as opposed to higher throughput.  By default, this
 	latency as opposed to higher throughput.  By default, this

+ 14 - 0
include/net/inet_sock.h

@@ -28,6 +28,7 @@
 #include <net/request_sock.h>
 #include <net/request_sock.h>
 #include <net/netns/hash.h>
 #include <net/netns/hash.h>
 #include <net/tcp_states.h>
 #include <net/tcp_states.h>
+#include <net/l3mdev.h>
 
 
 /** struct ip_options - IP Options
 /** struct ip_options - IP Options
  *
  *
@@ -113,6 +114,19 @@ static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb)
 	return sk->sk_mark;
 	return sk->sk_mark;
 }
 }
 
 
+static inline int inet_request_bound_dev_if(const struct sock *sk,
+					    struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	struct net *net = sock_net(sk);
+
+	if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept)
+		return l3mdev_master_ifindex_by_index(net, skb->skb_iif);
+#endif
+
+	return sk->sk_bound_dev_if;
+}
+
 struct inet_cork {
 struct inet_cork {
 	unsigned int		flags;
 	unsigned int		flags;
 	__be32			addr;
 	__be32			addr;

+ 3 - 0
include/net/netns/ipv4.h

@@ -86,6 +86,9 @@ struct netns_ipv4 {
 
 
 	int sysctl_fwmark_reflect;
 	int sysctl_fwmark_reflect;
 	int sysctl_tcp_fwmark_accept;
 	int sysctl_tcp_fwmark_accept;
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	int sysctl_tcp_l3mdev_accept;
+#endif
 	int sysctl_tcp_mtu_probing;
 	int sysctl_tcp_mtu_probing;
 	int sysctl_tcp_base_mss;
 	int sysctl_tcp_base_mss;
 	int sysctl_tcp_probe_threshold;
 	int sysctl_tcp_probe_threshold;

+ 2 - 2
net/ipv4/syncookies.c

@@ -351,7 +351,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	treq->snt_synack.v64	= 0;
 	treq->snt_synack.v64	= 0;
 	treq->tfo_listener	= false;
 	treq->tfo_listener	= false;
 
 
-	ireq->ir_iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 
 
 	/* We throwed the options of the initial SYN away, so we hope
 	/* We throwed the options of the initial SYN away, so we hope
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
 	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +371,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 	 * hasn't changed since we received the original syn, but I see
 	 * hasn't changed since we received the original syn, but I see
 	 * no easy way to do this.
 	 * no easy way to do this.
 	 */
 	 */
-	flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+	flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
 			   inet_sk_flowi_flags(sk),
 			   inet_sk_flowi_flags(sk),
 			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,
 			   opt->srr ? opt->faddr : ireq->ir_rmt_addr,

+ 11 - 0
net/ipv4/sysctl_net_ipv4.c

@@ -915,6 +915,17 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 		.proc_handler	= proc_dointvec,
 	},
 	},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+	{
+		.procname	= "tcp_l3mdev_accept",
+		.data		= &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 	{
 		.procname	= "tcp_mtu_probing",
 		.procname	= "tcp_mtu_probing",
 		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing,
 		.data		= &init_net.ipv4.sysctl_tcp_mtu_probing,

+ 1 - 1
net/ipv4/tcp_input.c

@@ -6204,7 +6204,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 	tcp_openreq_init(req, &tmp_opt, skb, sk);
 
 
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
 	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
-	inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
 
 
 	af_ops->init_req(req, sk, skb);
 	af_ops->init_req(req, sk, skb);
 
 

+ 1 - 0
net/ipv4/tcp_ipv4.c

@@ -1276,6 +1276,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 	ireq		      = inet_rsk(req);
 	ireq		      = inet_rsk(req);
 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
 	sk_daddr_set(newsk, ireq->ir_rmt_addr);
 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
 	sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+	newsk->sk_bound_dev_if = ireq->ir_iif;
 	newinet->inet_saddr	      = ireq->ir_loc_addr;
 	newinet->inet_saddr	      = ireq->ir_loc_addr;
 	inet_opt	      = ireq->opt;
 	inet_opt	      = ireq->opt;
 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
 	rcu_assign_pointer(newinet->inet_opt, inet_opt);

+ 2 - 2
net/ipv6/syncookies.c

@@ -193,7 +193,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		ireq->pktopts = skb;
 		ireq->pktopts = skb;
 	}
 	}
 
 
-	ireq->ir_iif = sk->sk_bound_dev_if;
+	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 	/* So that link locals have meaning */
 	/* So that link locals have meaning */
 	if (!sk->sk_bound_dev_if &&
 	if (!sk->sk_bound_dev_if &&
 	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
 	    ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
@@ -224,7 +224,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 		fl6.daddr = ireq->ir_v6_rmt_addr;
 		fl6.daddr = ireq->ir_v6_rmt_addr;
 		final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
 		final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
 		fl6.saddr = ireq->ir_v6_loc_addr;
 		fl6.saddr = ireq->ir_v6_loc_addr;
-		fl6.flowi6_oif = sk->sk_bound_dev_if;
+		fl6.flowi6_oif = ireq->ir_iif;
 		fl6.flowi6_mark = ireq->ir_mark;
 		fl6.flowi6_mark = ireq->ir_mark;
 		fl6.fl6_dport = ireq->ir_rmt_port;
 		fl6.fl6_dport = ireq->ir_rmt_port;
 		fl6.fl6_sport = inet_sk(sk)->inet_sport;
 		fl6.fl6_sport = inet_sk(sk)->inet_sport;