veth.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268
  1. /*
  2. * drivers/net/veth.c
  3. *
  4. * Copyright (C) 2007 OpenVZ http://openvz.org, SWsoft Inc
  5. *
  6. * Author: Pavel Emelianov <xemul@openvz.org>
  7. * Ethtool interface from: Eric W. Biederman <ebiederm@xmission.com>
  8. *
  9. */
  10. #include <linux/netdevice.h>
  11. #include <linux/slab.h>
  12. #include <linux/ethtool.h>
  13. #include <linux/etherdevice.h>
  14. #include <linux/u64_stats_sync.h>
  15. #include <net/rtnetlink.h>
  16. #include <net/dst.h>
  17. #include <net/xfrm.h>
  18. #include <net/xdp.h>
  19. #include <linux/veth.h>
  20. #include <linux/module.h>
  21. #include <linux/bpf.h>
  22. #include <linux/filter.h>
  23. #include <linux/ptr_ring.h>
  24. #include <linux/bpf_trace.h>
/* Driver name and version strings copied out by veth_get_drvinfo(). */
#define DRV_NAME "veth"
#define DRV_VERSION "1.0"
/* Low pointer bit that tags a ring entry as an xdp_frame rather than an skb. */
#define VETH_XDP_FLAG BIT(0)
/* Size handed to ptr_ring_init() for every per-queue xdp_ring. */
#define VETH_RING_SIZE 256
/* Headroom left in front of the packet when an skb is copied to a new page. */
#define VETH_XDP_HEADROOM (XDP_PACKET_HEADROOM + NET_IP_ALIGN)
/*
 * Bits OR'ed into the xdp_xmit mask by the receive paths so that veth_poll()
 * can tell XDP_TX from XDP_REDIRECT and run the matching flush.
 */
#define VETH_XDP_TX BIT(0)
#define VETH_XDP_REDIR BIT(1)
/*
 * Per-CPU transmit counters. veth_xmit() updates the local CPU's copy and
 * veth_stats_one() sums all CPUs under the u64_stats sequence.
 */
struct pcpu_vstats {
	u64 packets;			/* frames successfully forwarded */
	u64 bytes;			/* bytes successfully forwarded */
	struct u64_stats_sync syncp;	/* guards 64-bit reads of the above */
};
/* Per receive-queue state used when an XDP program is attached. */
struct veth_rq {
	struct napi_struct xdp_napi;	/* NAPI context polled by veth_poll() */
	struct net_device *dev;		/* device used for XDP_TX, redirect and tracing */
	struct bpf_prog __rcu *xdp_prog; /* non-NULL implies xdp_ring is initialised */
	struct xdp_mem_info xdp_mem;	/* saved copy of xdp_rxq.mem, which gets overwritten */
	bool rx_notify_masked;		/* true while NAPI is scheduled; see __veth_xdp_flush() */
	struct ptr_ring xdp_ring;	/* holds skb pointers and tagged xdp_frame pointers */
	struct xdp_rxq_info xdp_rxq;	/* rxq info passed to the XDP program */
};
/* Private data of one end of a veth pair. */
struct veth_priv {
	struct net_device __rcu *peer;	/* other end of the pair; may be NULL */
	atomic64_t dropped;		/* frames dropped in veth_xmit() */
	struct bpf_prog *_xdp_prog;	/* program set through veth_xdp_set() */
	struct veth_rq *rq;		/* array allocated by veth_alloc_queues() */
	unsigned int requested_headroom; /* last value given to veth_set_rx_headroom() */
};
/*
 * ethtool interface
 */
/*
 * Names of the ethtool statistics, in the order veth_get_ethtool_stats()
 * fills them in. Each entry is a fixed ETH_GSTRING_LEN sized string so the
 * whole table can be copied out with a single memcpy().
 */
static struct {
	const char string[ETH_GSTRING_LEN];
} ethtool_stats_keys[] = {
	{ "peer_ifindex" },
};
  62. static int veth_get_link_ksettings(struct net_device *dev,
  63. struct ethtool_link_ksettings *cmd)
  64. {
  65. cmd->base.speed = SPEED_10000;
  66. cmd->base.duplex = DUPLEX_FULL;
  67. cmd->base.port = PORT_TP;
  68. cmd->base.autoneg = AUTONEG_DISABLE;
  69. return 0;
  70. }
  71. static void veth_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
  72. {
  73. strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
  74. strlcpy(info->version, DRV_VERSION, sizeof(info->version));
  75. }
  76. static void veth_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
  77. {
  78. switch(stringset) {
  79. case ETH_SS_STATS:
  80. memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
  81. break;
  82. }
  83. }
  84. static int veth_get_sset_count(struct net_device *dev, int sset)
  85. {
  86. switch (sset) {
  87. case ETH_SS_STATS:
  88. return ARRAY_SIZE(ethtool_stats_keys);
  89. default:
  90. return -EOPNOTSUPP;
  91. }
  92. }
  93. static void veth_get_ethtool_stats(struct net_device *dev,
  94. struct ethtool_stats *stats, u64 *data)
  95. {
  96. struct veth_priv *priv = netdev_priv(dev);
  97. struct net_device *peer = rtnl_dereference(priv->peer);
  98. data[0] = peer ? peer->ifindex : 0;
  99. }
/* ethtool callbacks; installed on the device by veth_setup(). */
static const struct ethtool_ops veth_ethtool_ops = {
	.get_drvinfo = veth_get_drvinfo,
	.get_link = ethtool_op_get_link,
	.get_strings = veth_get_strings,
	.get_sset_count = veth_get_sset_count,
	.get_ethtool_stats = veth_get_ethtool_stats,
	.get_link_ksettings = veth_get_link_ksettings,
};
  108. /* general routines */
  109. static bool veth_is_xdp_frame(void *ptr)
  110. {
  111. return (unsigned long)ptr & VETH_XDP_FLAG;
  112. }
  113. static void *veth_ptr_to_xdp(void *ptr)
  114. {
  115. return (void *)((unsigned long)ptr & ~VETH_XDP_FLAG);
  116. }
  117. static void *veth_xdp_to_ptr(void *ptr)
  118. {
  119. return (void *)((unsigned long)ptr | VETH_XDP_FLAG);
  120. }
  121. static void veth_ptr_free(void *ptr)
  122. {
  123. if (veth_is_xdp_frame(ptr))
  124. xdp_return_frame(veth_ptr_to_xdp(ptr));
  125. else
  126. kfree_skb(ptr);
  127. }
  128. static void __veth_xdp_flush(struct veth_rq *rq)
  129. {
  130. /* Write ptr_ring before reading rx_notify_masked */
  131. smp_mb();
  132. if (!rq->rx_notify_masked) {
  133. rq->rx_notify_masked = true;
  134. napi_schedule(&rq->xdp_napi);
  135. }
  136. }
  137. static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
  138. {
  139. if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
  140. dev_kfree_skb_any(skb);
  141. return NET_RX_DROP;
  142. }
  143. return NET_RX_SUCCESS;
  144. }
  145. static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
  146. struct veth_rq *rq, bool xdp)
  147. {
  148. return __dev_forward_skb(dev, skb) ?: xdp ?
  149. veth_xdp_rx(rq, skb) :
  150. netif_rx(skb);
  151. }
  152. static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
  153. {
  154. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  155. struct veth_rq *rq = NULL;
  156. struct net_device *rcv;
  157. int length = skb->len;
  158. bool rcv_xdp = false;
  159. int rxq;
  160. rcu_read_lock();
  161. rcv = rcu_dereference(priv->peer);
  162. if (unlikely(!rcv)) {
  163. kfree_skb(skb);
  164. goto drop;
  165. }
  166. rcv_priv = netdev_priv(rcv);
  167. rxq = skb_get_queue_mapping(skb);
  168. if (rxq < rcv->real_num_rx_queues) {
  169. rq = &rcv_priv->rq[rxq];
  170. rcv_xdp = rcu_access_pointer(rq->xdp_prog);
  171. if (rcv_xdp)
  172. skb_record_rx_queue(skb, rxq);
  173. }
  174. if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
  175. struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
  176. u64_stats_update_begin(&stats->syncp);
  177. stats->bytes += length;
  178. stats->packets++;
  179. u64_stats_update_end(&stats->syncp);
  180. } else {
  181. drop:
  182. atomic64_inc(&priv->dropped);
  183. }
  184. if (rcv_xdp)
  185. __veth_xdp_flush(rq);
  186. rcu_read_unlock();
  187. return NETDEV_TX_OK;
  188. }
  189. static u64 veth_stats_one(struct pcpu_vstats *result, struct net_device *dev)
  190. {
  191. struct veth_priv *priv = netdev_priv(dev);
  192. int cpu;
  193. result->packets = 0;
  194. result->bytes = 0;
  195. for_each_possible_cpu(cpu) {
  196. struct pcpu_vstats *stats = per_cpu_ptr(dev->vstats, cpu);
  197. u64 packets, bytes;
  198. unsigned int start;
  199. do {
  200. start = u64_stats_fetch_begin_irq(&stats->syncp);
  201. packets = stats->packets;
  202. bytes = stats->bytes;
  203. } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
  204. result->packets += packets;
  205. result->bytes += bytes;
  206. }
  207. return atomic64_read(&priv->dropped);
  208. }
  209. static void veth_get_stats64(struct net_device *dev,
  210. struct rtnl_link_stats64 *tot)
  211. {
  212. struct veth_priv *priv = netdev_priv(dev);
  213. struct net_device *peer;
  214. struct pcpu_vstats one;
  215. tot->tx_dropped = veth_stats_one(&one, dev);
  216. tot->tx_bytes = one.bytes;
  217. tot->tx_packets = one.packets;
  218. rcu_read_lock();
  219. peer = rcu_dereference(priv->peer);
  220. if (peer) {
  221. tot->rx_dropped = veth_stats_one(&one, peer);
  222. tot->rx_bytes = one.bytes;
  223. tot->rx_packets = one.packets;
  224. }
  225. rcu_read_unlock();
  226. }
/*
 * Fake multicast ability: installed as ndo_set_rx_mode and intentionally
 * does nothing.
 */
static void veth_set_multicast_list(struct net_device *dev)
{
}
  231. static struct sk_buff *veth_build_skb(void *head, int headroom, int len,
  232. int buflen)
  233. {
  234. struct sk_buff *skb;
  235. if (!buflen) {
  236. buflen = SKB_DATA_ALIGN(headroom + len) +
  237. SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  238. }
  239. skb = build_skb(head, buflen);
  240. if (!skb)
  241. return NULL;
  242. skb_reserve(skb, headroom);
  243. skb_put(skb, len);
  244. return skb;
  245. }
  246. static int veth_select_rxq(struct net_device *dev)
  247. {
  248. return smp_processor_id() % dev->real_num_rx_queues;
  249. }
  250. static int veth_xdp_xmit(struct net_device *dev, int n,
  251. struct xdp_frame **frames, u32 flags)
  252. {
  253. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  254. struct net_device *rcv;
  255. unsigned int max_len;
  256. struct veth_rq *rq;
  257. int i, drops = 0;
  258. if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
  259. return -EINVAL;
  260. rcv = rcu_dereference(priv->peer);
  261. if (unlikely(!rcv))
  262. return -ENXIO;
  263. rcv_priv = netdev_priv(rcv);
  264. rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  265. /* Non-NULL xdp_prog ensures that xdp_ring is initialized on receive
  266. * side. This means an XDP program is loaded on the peer and the peer
  267. * device is up.
  268. */
  269. if (!rcu_access_pointer(rq->xdp_prog))
  270. return -ENXIO;
  271. max_len = rcv->mtu + rcv->hard_header_len + VLAN_HLEN;
  272. spin_lock(&rq->xdp_ring.producer_lock);
  273. for (i = 0; i < n; i++) {
  274. struct xdp_frame *frame = frames[i];
  275. void *ptr = veth_xdp_to_ptr(frame);
  276. if (unlikely(frame->len > max_len ||
  277. __ptr_ring_produce(&rq->xdp_ring, ptr))) {
  278. xdp_return_frame_rx_napi(frame);
  279. drops++;
  280. }
  281. }
  282. spin_unlock(&rq->xdp_ring.producer_lock);
  283. if (flags & XDP_XMIT_FLUSH)
  284. __veth_xdp_flush(rq);
  285. return n - drops;
  286. }
  287. static void veth_xdp_flush(struct net_device *dev)
  288. {
  289. struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
  290. struct net_device *rcv;
  291. struct veth_rq *rq;
  292. rcu_read_lock();
  293. rcv = rcu_dereference(priv->peer);
  294. if (unlikely(!rcv))
  295. goto out;
  296. rcv_priv = netdev_priv(rcv);
  297. rq = &rcv_priv->rq[veth_select_rxq(rcv)];
  298. /* xdp_ring is initialized on receive side? */
  299. if (unlikely(!rcu_access_pointer(rq->xdp_prog)))
  300. goto out;
  301. __veth_xdp_flush(rq);
  302. out:
  303. rcu_read_unlock();
  304. }
  305. static int veth_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
  306. {
  307. struct xdp_frame *frame = convert_to_xdp_frame(xdp);
  308. if (unlikely(!frame))
  309. return -EOVERFLOW;
  310. return veth_xdp_xmit(dev, 1, &frame, 0);
  311. }
/*
 * Process one xdp_frame taken from the ring.
 *
 * Runs the queue's XDP program (if any) on the frame and acts on the verdict:
 *   XDP_PASS      - build an skb around the frame's buffer and return it
 *   XDP_TX        - send the buffer back through veth_xdp_tx()
 *   XDP_REDIRECT  - hand the buffer to xdp_do_redirect()
 *   anything else - return the frame
 * On XDP_TX / XDP_REDIRECT success the matching VETH_XDP_* bit is set in
 * *@xdp_xmit so the caller can flush.
 *
 * Returns the skb for the stack, or NULL when the frame was consumed,
 * dropped, or the skb could not be built.
 */
static struct sk_buff *veth_xdp_rcv_one(struct veth_rq *rq,
					struct xdp_frame *frame,
					unsigned int *xdp_xmit)
{
	void *hard_start = frame->data - frame->headroom;
	/* Buffer start: sizeof(struct xdp_frame) bytes below the headroom. */
	void *head = hard_start - sizeof(struct xdp_frame);
	int len = frame->len, delta = 0;
	struct xdp_frame orig_frame;
	struct bpf_prog *xdp_prog;
	unsigned int headroom;
	struct sk_buff *skb;

	rcu_read_lock();
	xdp_prog = rcu_dereference(rq->xdp_prog);
	if (likely(xdp_prog)) {
		struct xdp_buff xdp;
		u32 act;

		xdp.data_hard_start = hard_start;
		xdp.data = frame->data;
		xdp.data_end = frame->data + frame->len;
		xdp.data_meta = frame->data - frame->metasize;
		xdp.rxq = &rq->xdp_rxq;

		act = bpf_prog_run_xdp(xdp_prog, &xdp);

		switch (act) {
		case XDP_PASS:
			/* Pick up any head/tail adjustment made by the program. */
			delta = frame->data - xdp.data;
			len = xdp.data_end - xdp.data;
			break;
		case XDP_TX:
			/*
			 * Keep a copy of the frame for the error path.
			 * NOTE(review): presumably needed because converting
			 * xdp back to a frame may overwrite *frame - confirm.
			 */
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
				trace_xdp_exception(rq->dev, xdp_prog, act);
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_TX;
			rcu_read_unlock();
			goto xdp_xmit;
		case XDP_REDIRECT:
			/* Same frame copy and buffer setup as for XDP_TX. */
			orig_frame = *frame;
			xdp.data_hard_start = head;
			xdp.rxq->mem = frame->mem;
			if (xdp_do_redirect(rq->dev, &xdp, xdp_prog)) {
				frame = &orig_frame;
				goto err_xdp;
			}
			*xdp_xmit |= VETH_XDP_REDIR;
			rcu_read_unlock();
			goto xdp_xmit;
		default:
			bpf_warn_invalid_xdp_action(act);
			/* fall through */
		case XDP_ABORTED:
			trace_xdp_exception(rq->dev, xdp_prog, act);
			/* fall through */
		case XDP_DROP:
			goto err_xdp;
		}
	}
	rcu_read_unlock();

	/* skb headroom covers the xdp_frame area plus the (adjusted) headroom. */
	headroom = sizeof(struct xdp_frame) + frame->headroom - delta;
	skb = veth_build_skb(head, headroom, len, 0);
	if (!skb) {
		xdp_return_frame(frame);
		goto err;
	}

	xdp_scrub_frame(frame);
	skb->protocol = eth_type_trans(skb, rq->dev);
err:
	return skb;
err_xdp:
	rcu_read_unlock();
	xdp_return_frame(frame);
xdp_xmit:
	return NULL;
}
  387. static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, struct sk_buff *skb,
  388. unsigned int *xdp_xmit)
  389. {
  390. u32 pktlen, headroom, act, metalen;
  391. void *orig_data, *orig_data_end;
  392. struct bpf_prog *xdp_prog;
  393. int mac_len, delta, off;
  394. struct xdp_buff xdp;
  395. rcu_read_lock();
  396. xdp_prog = rcu_dereference(rq->xdp_prog);
  397. if (unlikely(!xdp_prog)) {
  398. rcu_read_unlock();
  399. goto out;
  400. }
  401. mac_len = skb->data - skb_mac_header(skb);
  402. pktlen = skb->len + mac_len;
  403. headroom = skb_headroom(skb) - mac_len;
  404. if (skb_shared(skb) || skb_head_is_locked(skb) ||
  405. skb_is_nonlinear(skb) || headroom < XDP_PACKET_HEADROOM) {
  406. struct sk_buff *nskb;
  407. int size, head_off;
  408. void *head, *start;
  409. struct page *page;
  410. size = SKB_DATA_ALIGN(VETH_XDP_HEADROOM + pktlen) +
  411. SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
  412. if (size > PAGE_SIZE)
  413. goto drop;
  414. page = alloc_page(GFP_ATOMIC | __GFP_NOWARN);
  415. if (!page)
  416. goto drop;
  417. head = page_address(page);
  418. start = head + VETH_XDP_HEADROOM;
  419. if (skb_copy_bits(skb, -mac_len, start, pktlen)) {
  420. page_frag_free(head);
  421. goto drop;
  422. }
  423. nskb = veth_build_skb(head,
  424. VETH_XDP_HEADROOM + mac_len, skb->len,
  425. PAGE_SIZE);
  426. if (!nskb) {
  427. page_frag_free(head);
  428. goto drop;
  429. }
  430. skb_copy_header(nskb, skb);
  431. head_off = skb_headroom(nskb) - skb_headroom(skb);
  432. skb_headers_offset_update(nskb, head_off);
  433. if (skb->sk)
  434. skb_set_owner_w(nskb, skb->sk);
  435. consume_skb(skb);
  436. skb = nskb;
  437. }
  438. xdp.data_hard_start = skb->head;
  439. xdp.data = skb_mac_header(skb);
  440. xdp.data_end = xdp.data + pktlen;
  441. xdp.data_meta = xdp.data;
  442. xdp.rxq = &rq->xdp_rxq;
  443. orig_data = xdp.data;
  444. orig_data_end = xdp.data_end;
  445. act = bpf_prog_run_xdp(xdp_prog, &xdp);
  446. switch (act) {
  447. case XDP_PASS:
  448. break;
  449. case XDP_TX:
  450. get_page(virt_to_page(xdp.data));
  451. consume_skb(skb);
  452. xdp.rxq->mem = rq->xdp_mem;
  453. if (unlikely(veth_xdp_tx(rq->dev, &xdp) < 0)) {
  454. trace_xdp_exception(rq->dev, xdp_prog, act);
  455. goto err_xdp;
  456. }
  457. *xdp_xmit |= VETH_XDP_TX;
  458. rcu_read_unlock();
  459. goto xdp_xmit;
  460. case XDP_REDIRECT:
  461. get_page(virt_to_page(xdp.data));
  462. consume_skb(skb);
  463. xdp.rxq->mem = rq->xdp_mem;
  464. if (xdp_do_redirect(rq->dev, &xdp, xdp_prog))
  465. goto err_xdp;
  466. *xdp_xmit |= VETH_XDP_REDIR;
  467. rcu_read_unlock();
  468. goto xdp_xmit;
  469. default:
  470. bpf_warn_invalid_xdp_action(act);
  471. case XDP_ABORTED:
  472. trace_xdp_exception(rq->dev, xdp_prog, act);
  473. case XDP_DROP:
  474. goto drop;
  475. }
  476. rcu_read_unlock();
  477. delta = orig_data - xdp.data;
  478. off = mac_len + delta;
  479. if (off > 0)
  480. __skb_push(skb, off);
  481. else if (off < 0)
  482. __skb_pull(skb, -off);
  483. skb->mac_header -= delta;
  484. off = xdp.data_end - orig_data_end;
  485. if (off != 0)
  486. __skb_put(skb, off);
  487. skb->protocol = eth_type_trans(skb, rq->dev);
  488. metalen = xdp.data - xdp.data_meta;
  489. if (metalen)
  490. skb_metadata_set(skb, metalen);
  491. out:
  492. return skb;
  493. drop:
  494. rcu_read_unlock();
  495. kfree_skb(skb);
  496. return NULL;
  497. err_xdp:
  498. rcu_read_unlock();
  499. page_frag_free(xdp.data);
  500. xdp_xmit:
  501. return NULL;
  502. }
  503. static int veth_xdp_rcv(struct veth_rq *rq, int budget, unsigned int *xdp_xmit)
  504. {
  505. int i, done = 0;
  506. for (i = 0; i < budget; i++) {
  507. void *ptr = __ptr_ring_consume(&rq->xdp_ring);
  508. struct sk_buff *skb;
  509. if (!ptr)
  510. break;
  511. if (veth_is_xdp_frame(ptr)) {
  512. skb = veth_xdp_rcv_one(rq, veth_ptr_to_xdp(ptr),
  513. xdp_xmit);
  514. } else {
  515. skb = veth_xdp_rcv_skb(rq, ptr, xdp_xmit);
  516. }
  517. if (skb)
  518. napi_gro_receive(&rq->xdp_napi, skb);
  519. done++;
  520. }
  521. return done;
  522. }
/*
 * NAPI poll handler for one receive queue.
 *
 * Processes up to @budget ring entries. When the ring is drained and NAPI
 * completes, rx_notify_masked is cleared so producers kick NAPI again; the
 * ring is then re-checked to catch entries produced in the meantime.
 * Pending XDP_TX and XDP_REDIRECT work recorded in xdp_xmit is flushed
 * before returning.
 *
 * Returns the number of entries processed.
 */
static int veth_poll(struct napi_struct *napi, int budget)
{
	struct veth_rq *rq =
		container_of(napi, struct veth_rq, xdp_napi);
	unsigned int xdp_xmit = 0;
	int done;

	xdp_set_return_frame_no_direct();
	done = veth_xdp_rcv(rq, budget, &xdp_xmit);

	if (done < budget && napi_complete_done(napi, done)) {
		/* Write rx_notify_masked before reading ptr_ring */
		smp_store_mb(rq->rx_notify_masked, false);
		/* Entries slipped in after the last consume: reschedule. */
		if (unlikely(!__ptr_ring_empty(&rq->xdp_ring))) {
			rq->rx_notify_masked = true;
			napi_schedule(&rq->xdp_napi);
		}
	}

	if (xdp_xmit & VETH_XDP_TX)
		veth_xdp_flush(rq->dev);
	if (xdp_xmit & VETH_XDP_REDIR)
		xdp_do_flush_map();
	xdp_clear_return_frame_no_direct();

	return done;
}
  546. static int veth_napi_add(struct net_device *dev)
  547. {
  548. struct veth_priv *priv = netdev_priv(dev);
  549. int err, i;
  550. for (i = 0; i < dev->real_num_rx_queues; i++) {
  551. struct veth_rq *rq = &priv->rq[i];
  552. err = ptr_ring_init(&rq->xdp_ring, VETH_RING_SIZE, GFP_KERNEL);
  553. if (err)
  554. goto err_xdp_ring;
  555. }
  556. for (i = 0; i < dev->real_num_rx_queues; i++) {
  557. struct veth_rq *rq = &priv->rq[i];
  558. netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
  559. napi_enable(&rq->xdp_napi);
  560. }
  561. return 0;
  562. err_xdp_ring:
  563. for (i--; i >= 0; i--)
  564. ptr_ring_cleanup(&priv->rq[i].xdp_ring, veth_ptr_free);
  565. return err;
  566. }
  567. static void veth_napi_del(struct net_device *dev)
  568. {
  569. struct veth_priv *priv = netdev_priv(dev);
  570. int i;
  571. for (i = 0; i < dev->real_num_rx_queues; i++) {
  572. struct veth_rq *rq = &priv->rq[i];
  573. napi_disable(&rq->xdp_napi);
  574. napi_hash_del(&rq->xdp_napi);
  575. }
  576. synchronize_net();
  577. for (i = 0; i < dev->real_num_rx_queues; i++) {
  578. struct veth_rq *rq = &priv->rq[i];
  579. netif_napi_del(&rq->xdp_napi);
  580. rq->rx_notify_masked = false;
  581. ptr_ring_cleanup(&rq->xdp_ring, veth_ptr_free);
  582. }
  583. }
  584. static int veth_enable_xdp(struct net_device *dev)
  585. {
  586. struct veth_priv *priv = netdev_priv(dev);
  587. int err, i;
  588. if (!xdp_rxq_info_is_reg(&priv->rq[0].xdp_rxq)) {
  589. for (i = 0; i < dev->real_num_rx_queues; i++) {
  590. struct veth_rq *rq = &priv->rq[i];
  591. err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i);
  592. if (err < 0)
  593. goto err_rxq_reg;
  594. err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
  595. MEM_TYPE_PAGE_SHARED,
  596. NULL);
  597. if (err < 0)
  598. goto err_reg_mem;
  599. /* Save original mem info as it can be overwritten */
  600. rq->xdp_mem = rq->xdp_rxq.mem;
  601. }
  602. err = veth_napi_add(dev);
  603. if (err)
  604. goto err_rxq_reg;
  605. }
  606. for (i = 0; i < dev->real_num_rx_queues; i++)
  607. rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
  608. return 0;
  609. err_reg_mem:
  610. xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
  611. err_rxq_reg:
  612. for (i--; i >= 0; i--)
  613. xdp_rxq_info_unreg(&priv->rq[i].xdp_rxq);
  614. return err;
  615. }
  616. static void veth_disable_xdp(struct net_device *dev)
  617. {
  618. struct veth_priv *priv = netdev_priv(dev);
  619. int i;
  620. for (i = 0; i < dev->real_num_rx_queues; i++)
  621. rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
  622. veth_napi_del(dev);
  623. for (i = 0; i < dev->real_num_rx_queues; i++) {
  624. struct veth_rq *rq = &priv->rq[i];
  625. rq->xdp_rxq.mem = rq->xdp_mem;
  626. xdp_rxq_info_unreg(&rq->xdp_rxq);
  627. }
  628. }
  629. static int veth_open(struct net_device *dev)
  630. {
  631. struct veth_priv *priv = netdev_priv(dev);
  632. struct net_device *peer = rtnl_dereference(priv->peer);
  633. int err;
  634. if (!peer)
  635. return -ENOTCONN;
  636. if (priv->_xdp_prog) {
  637. err = veth_enable_xdp(dev);
  638. if (err)
  639. return err;
  640. }
  641. if (peer->flags & IFF_UP) {
  642. netif_carrier_on(dev);
  643. netif_carrier_on(peer);
  644. }
  645. return 0;
  646. }
  647. static int veth_close(struct net_device *dev)
  648. {
  649. struct veth_priv *priv = netdev_priv(dev);
  650. struct net_device *peer = rtnl_dereference(priv->peer);
  651. netif_carrier_off(dev);
  652. if (peer)
  653. netif_carrier_off(peer);
  654. if (priv->_xdp_prog)
  655. veth_disable_xdp(dev);
  656. return 0;
  657. }
  658. static int is_valid_veth_mtu(int mtu)
  659. {
  660. return mtu >= ETH_MIN_MTU && mtu <= ETH_MAX_MTU;
  661. }
  662. static int veth_dev_init(struct net_device *dev)
  663. {
  664. dev->vstats = netdev_alloc_pcpu_stats(struct pcpu_vstats);
  665. if (!dev->vstats)
  666. return -ENOMEM;
  667. return 0;
  668. }
/* priv_destructor: release the per-CPU statistics allocated by veth_dev_init(). */
static void veth_dev_free(struct net_device *dev)
{
	free_percpu(dev->vstats);
}
#ifdef CONFIG_NET_POLL_CONTROLLER
/* ndo_poll_controller: deliberately empty, see below. */
static void veth_poll_controller(struct net_device *dev)
{
	/* veth only receives frames when its peer sends one.
	 * Since it has nothing to do with disabling irqs, we are guaranteed
	 * never to have pending data when we poll for it, so
	 * there is nothing to do here.
	 *
	 * We need this though so netpoll recognizes us as an interface that
	 * supports polling, which enables bridge devices in virt setups to
	 * still use netconsole.
	 */
}
#endif /* CONFIG_NET_POLL_CONTROLLER */
  687. static int veth_get_iflink(const struct net_device *dev)
  688. {
  689. struct veth_priv *priv = netdev_priv(dev);
  690. struct net_device *peer;
  691. int iflink;
  692. rcu_read_lock();
  693. peer = rcu_dereference(priv->peer);
  694. iflink = peer ? peer->ifindex : 0;
  695. rcu_read_unlock();
  696. return iflink;
  697. }
  698. static netdev_features_t veth_fix_features(struct net_device *dev,
  699. netdev_features_t features)
  700. {
  701. struct veth_priv *priv = netdev_priv(dev);
  702. struct net_device *peer;
  703. peer = rtnl_dereference(priv->peer);
  704. if (peer) {
  705. struct veth_priv *peer_priv = netdev_priv(peer);
  706. if (peer_priv->_xdp_prog)
  707. features &= ~NETIF_F_GSO_SOFTWARE;
  708. }
  709. return features;
  710. }
  711. static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
  712. {
  713. struct veth_priv *peer_priv, *priv = netdev_priv(dev);
  714. struct net_device *peer;
  715. if (new_hr < 0)
  716. new_hr = 0;
  717. rcu_read_lock();
  718. peer = rcu_dereference(priv->peer);
  719. if (unlikely(!peer))
  720. goto out;
  721. peer_priv = netdev_priv(peer);
  722. priv->requested_headroom = new_hr;
  723. new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
  724. dev->needed_headroom = new_hr;
  725. peer->needed_headroom = new_hr;
  726. out:
  727. rcu_read_unlock();
  728. }
/*
 * Attach, replace or detach (@prog == NULL) the device's XDP program.
 *
 * Attaching requires a peer, a peer MTU that fits in one page together
 * with the XDP headroom and shared info, and at least as many rx queues
 * here as the peer has tx queues. While a program is attached the peer
 * loses software GSO and has its max_mtu capped; both are restored on
 * detach.
 *
 * Returns 0 on success. On failure priv->_xdp_prog is restored to the
 * previous program and -ENOTCONN, -ERANGE, -ENOSPC or the
 * veth_enable_xdp() error is returned, with a message set in @extack.
 */
static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
			struct netlink_ext_ack *extack)
{
	struct veth_priv *priv = netdev_priv(dev);
	struct bpf_prog *old_prog;
	struct net_device *peer;
	unsigned int max_mtu;
	int err;

	/* Install the new program up front; the err path rolls this back. */
	old_prog = priv->_xdp_prog;
	priv->_xdp_prog = prog;
	peer = rtnl_dereference(priv->peer);

	if (prog) {
		if (!peer) {
			NL_SET_ERR_MSG_MOD(extack, "Cannot set XDP when peer is detached");
			err = -ENOTCONN;
			goto err;
		}

		/* Largest peer MTU whose frames still fit in a single page. */
		max_mtu = PAGE_SIZE - VETH_XDP_HEADROOM -
			  peer->hard_header_len -
			  SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
		if (peer->mtu > max_mtu) {
			NL_SET_ERR_MSG_MOD(extack, "Peer MTU is too large to set XDP");
			err = -ERANGE;
			goto err;
		}

		if (dev->real_num_rx_queues < peer->real_num_tx_queues) {
			NL_SET_ERR_MSG_MOD(extack, "XDP expects number of rx queues not less than peer tx queues");
			err = -ENOSPC;
			goto err;
		}

		/* If the device is down, veth_open() enables XDP later. */
		if (dev->flags & IFF_UP) {
			err = veth_enable_xdp(dev);
			if (err) {
				NL_SET_ERR_MSG_MOD(extack, "Setup for XDP failed");
				goto err;
			}
		}

		/* First attach: restrict the peer. */
		if (!old_prog) {
			peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
			peer->max_mtu = max_mtu;
		}
	}

	if (old_prog) {
		if (!prog) {
			/* Detach: tear down XDP and lift the peer restrictions. */
			if (dev->flags & IFF_UP)
				veth_disable_xdp(dev);

			if (peer) {
				peer->hw_features |= NETIF_F_GSO_SOFTWARE;
				peer->max_mtu = ETH_MAX_MTU;
			}
		}
		bpf_prog_put(old_prog);
	}

	/* Recompute peer features only on an attached <-> detached change. */
	if ((!!old_prog ^ !!prog) && peer)
		netdev_update_features(peer);

	return 0;
err:
	priv->_xdp_prog = old_prog;

	return err;
}
  789. static u32 veth_xdp_query(struct net_device *dev)
  790. {
  791. struct veth_priv *priv = netdev_priv(dev);
  792. const struct bpf_prog *xdp_prog;
  793. xdp_prog = priv->_xdp_prog;
  794. if (xdp_prog)
  795. return xdp_prog->aux->id;
  796. return 0;
  797. }
  798. static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp)
  799. {
  800. switch (xdp->command) {
  801. case XDP_SETUP_PROG:
  802. return veth_xdp_set(dev, xdp->prog, xdp->extack);
  803. case XDP_QUERY_PROG:
  804. xdp->prog_id = veth_xdp_query(dev);
  805. return 0;
  806. default:
  807. return -EINVAL;
  808. }
  809. }
/* net_device callbacks; installed on the device by veth_setup(). */
static const struct net_device_ops veth_netdev_ops = {
	.ndo_init = veth_dev_init,
	.ndo_open = veth_open,
	.ndo_stop = veth_close,
	.ndo_start_xmit = veth_xmit,
	.ndo_get_stats64 = veth_get_stats64,
	.ndo_set_rx_mode = veth_set_multicast_list,
	.ndo_set_mac_address = eth_mac_addr,
#ifdef CONFIG_NET_POLL_CONTROLLER
	.ndo_poll_controller = veth_poll_controller,
#endif
	.ndo_get_iflink = veth_get_iflink,
	.ndo_fix_features = veth_fix_features,
	.ndo_features_check = passthru_features_check,
	.ndo_set_rx_headroom = veth_set_rx_headroom,
	.ndo_bpf = veth_xdp,
	.ndo_xdp_xmit = veth_xdp_xmit,
};
/*
 * Feature set used by veth_setup() for the device's features, hw_features
 * and hw_enc_features.
 */
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
		       NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | \
		       NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \
		       NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | \
		       NETIF_F_HW_VLAN_STAG_TX | NETIF_F_HW_VLAN_STAG_RX )
  833. static void veth_setup(struct net_device *dev)
  834. {
  835. ether_setup(dev);
  836. dev->priv_flags &= ~IFF_TX_SKB_SHARING;
  837. dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
  838. dev->priv_flags |= IFF_NO_QUEUE;
  839. dev->priv_flags |= IFF_PHONY_HEADROOM;
  840. dev->netdev_ops = &veth_netdev_ops;
  841. dev->ethtool_ops = &veth_ethtool_ops;
  842. dev->features |= NETIF_F_LLTX;
  843. dev->features |= VETH_FEATURES;
  844. dev->vlan_features = dev->features &
  845. ~(NETIF_F_HW_VLAN_CTAG_TX |
  846. NETIF_F_HW_VLAN_STAG_TX |
  847. NETIF_F_HW_VLAN_CTAG_RX |
  848. NETIF_F_HW_VLAN_STAG_RX);
  849. dev->needs_free_netdev = true;
  850. dev->priv_destructor = veth_dev_free;
  851. dev->max_mtu = ETH_MAX_MTU;
  852. dev->hw_features = VETH_FEATURES;
  853. dev->hw_enc_features = VETH_FEATURES;
  854. dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
  855. }
  856. /*
  857. * netlink interface
  858. */
  859. static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
  860. struct netlink_ext_ack *extack)
  861. {
  862. if (tb[IFLA_ADDRESS]) {
  863. if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
  864. return -EINVAL;
  865. if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
  866. return -EADDRNOTAVAIL;
  867. }
  868. if (tb[IFLA_MTU]) {
  869. if (!is_valid_veth_mtu(nla_get_u32(tb[IFLA_MTU])))
  870. return -EINVAL;
  871. }
  872. return 0;
  873. }
  874. static int veth_alloc_queues(struct net_device *dev)
  875. {
  876. struct veth_priv *priv = netdev_priv(dev);
  877. priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
  878. if (!priv->rq)
  879. return -ENOMEM;
  880. return 0;
  881. }
  882. static void veth_free_queues(struct net_device *dev)
  883. {
  884. struct veth_priv *priv = netdev_priv(dev);
  885. kfree(priv->rq);
  886. }
  887. static struct rtnl_link_ops veth_link_ops;
  888. static int veth_newlink(struct net *src_net, struct net_device *dev,
  889. struct nlattr *tb[], struct nlattr *data[],
  890. struct netlink_ext_ack *extack)
  891. {
  892. int err, i;
  893. struct net_device *peer;
  894. struct veth_priv *priv;
  895. char ifname[IFNAMSIZ];
  896. struct nlattr *peer_tb[IFLA_MAX + 1], **tbp;
  897. unsigned char name_assign_type;
  898. struct ifinfomsg *ifmp;
  899. struct net *net;
  900. /*
  901. * create and register peer first
  902. */
  903. if (data != NULL && data[VETH_INFO_PEER] != NULL) {
  904. struct nlattr *nla_peer;
  905. nla_peer = data[VETH_INFO_PEER];
  906. ifmp = nla_data(nla_peer);
  907. err = rtnl_nla_parse_ifla(peer_tb,
  908. nla_data(nla_peer) + sizeof(struct ifinfomsg),
  909. nla_len(nla_peer) - sizeof(struct ifinfomsg),
  910. NULL);
  911. if (err < 0)
  912. return err;
  913. err = veth_validate(peer_tb, NULL, extack);
  914. if (err < 0)
  915. return err;
  916. tbp = peer_tb;
  917. } else {
  918. ifmp = NULL;
  919. tbp = tb;
  920. }
  921. if (ifmp && tbp[IFLA_IFNAME]) {
  922. nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ);
  923. name_assign_type = NET_NAME_USER;
  924. } else {
  925. snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d");
  926. name_assign_type = NET_NAME_ENUM;
  927. }
  928. net = rtnl_link_get_net(src_net, tbp);
  929. if (IS_ERR(net))
  930. return PTR_ERR(net);
  931. peer = rtnl_create_link(net, ifname, name_assign_type,
  932. &veth_link_ops, tbp);
  933. if (IS_ERR(peer)) {
  934. put_net(net);
  935. return PTR_ERR(peer);
  936. }
  937. err = veth_alloc_queues(peer);
  938. if (err) {
  939. put_net(net);
  940. goto err_peer_alloc_queues;
  941. }
  942. if (!ifmp || !tbp[IFLA_ADDRESS])
  943. eth_hw_addr_random(peer);
  944. if (ifmp && (dev->ifindex != 0))
  945. peer->ifindex = ifmp->ifi_index;
  946. peer->gso_max_size = dev->gso_max_size;
  947. peer->gso_max_segs = dev->gso_max_segs;
  948. err = register_netdevice(peer);
  949. put_net(net);
  950. net = NULL;
  951. if (err < 0)
  952. goto err_register_peer;
  953. netif_carrier_off(peer);
  954. err = rtnl_configure_link(peer, ifmp);
  955. if (err < 0)
  956. goto err_configure_peer;
  957. /*
  958. * register dev last
  959. *
  960. * note, that since we've registered new device the dev's name
  961. * should be re-allocated
  962. */
  963. err = veth_alloc_queues(dev);
  964. if (err)
  965. goto err_alloc_queues;
  966. if (tb[IFLA_ADDRESS] == NULL)
  967. eth_hw_addr_random(dev);
  968. if (tb[IFLA_IFNAME])
  969. nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
  970. else
  971. snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d");
  972. err = register_netdevice(dev);
  973. if (err < 0)
  974. goto err_register_dev;
  975. netif_carrier_off(dev);
  976. /*
  977. * tie the deviced together
  978. */
  979. priv = netdev_priv(dev);
  980. for (i = 0; i < dev->real_num_rx_queues; i++)
  981. priv->rq[i].dev = dev;
  982. rcu_assign_pointer(priv->peer, peer);
  983. priv = netdev_priv(peer);
  984. for (i = 0; i < peer->real_num_rx_queues; i++)
  985. priv->rq[i].dev = peer;
  986. rcu_assign_pointer(priv->peer, dev);
  987. return 0;
  988. err_register_dev:
  989. veth_free_queues(dev);
  990. err_alloc_queues:
  991. /* nothing to do */
  992. err_configure_peer:
  993. unregister_netdevice(peer);
  994. return err;
  995. err_register_peer:
  996. veth_free_queues(peer);
  997. err_peer_alloc_queues:
  998. free_netdev(peer);
  999. return err;
  1000. }
  1001. static void veth_dellink(struct net_device *dev, struct list_head *head)
  1002. {
  1003. struct veth_priv *priv;
  1004. struct net_device *peer;
  1005. priv = netdev_priv(dev);
  1006. peer = rtnl_dereference(priv->peer);
  1007. /* Note : dellink() is called from default_device_exit_batch(),
  1008. * before a rcu_synchronize() point. The devices are guaranteed
  1009. * not being freed before one RCU grace period.
  1010. */
  1011. RCU_INIT_POINTER(priv->peer, NULL);
  1012. unregister_netdevice_queue(dev, head);
  1013. if (peer) {
  1014. priv = netdev_priv(peer);
  1015. RCU_INIT_POINTER(priv->peer, NULL);
  1016. unregister_netdevice_queue(peer, head);
  1017. }
  1018. }
  1019. static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = {
  1020. [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) },
  1021. };
  1022. static struct net *veth_get_link_net(const struct net_device *dev)
  1023. {
  1024. struct veth_priv *priv = netdev_priv(dev);
  1025. struct net_device *peer = rtnl_dereference(priv->peer);
  1026. return peer ? dev_net(peer) : dev_net(dev);
  1027. }
  1028. static struct rtnl_link_ops veth_link_ops = {
  1029. .kind = DRV_NAME,
  1030. .priv_size = sizeof(struct veth_priv),
  1031. .setup = veth_setup,
  1032. .validate = veth_validate,
  1033. .newlink = veth_newlink,
  1034. .dellink = veth_dellink,
  1035. .policy = veth_policy,
  1036. .maxtype = VETH_INFO_MAX,
  1037. .get_link_net = veth_get_link_net,
  1038. };
  1039. /*
  1040. * init/fini
  1041. */
  1042. static __init int veth_init(void)
  1043. {
  1044. return rtnl_link_register(&veth_link_ops);
  1045. }
  1046. static __exit void veth_exit(void)
  1047. {
  1048. rtnl_link_unregister(&veth_link_ops);
  1049. }
  1050. module_init(veth_init);
  1051. module_exit(veth_exit);
  1052. MODULE_DESCRIPTION("Virtual Ethernet Tunnel");
  1053. MODULE_LICENSE("GPL v2");
  1054. MODULE_ALIAS_RTNL_LINK(DRV_NAME);