/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
 * applies to SOCK_STREAM sockets only
 * offers an alternative communication option for TCP-protocol sockets
 * applicable with RoCE-cards only
 *
 * Copyright IBM Corp. 2016
 *
 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
 * based on prototype from Frank Blaschka
 */
  14. #define KMSG_COMPONENT "smc"
  15. #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
  16. #include <linux/module.h>
  17. #include <linux/socket.h>
  18. #include <net/sock.h>
  19. #include "smc.h"
  20. static void smc_set_keepalive(struct sock *sk, int val)
  21. {
  22. struct smc_sock *smc = smc_sk(sk);
  23. smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
  24. }
  25. static struct proto smc_proto = {
  26. .name = "SMC",
  27. .owner = THIS_MODULE,
  28. .keepalive = smc_set_keepalive,
  29. .obj_size = sizeof(struct smc_sock),
  30. .slab_flags = SLAB_DESTROY_BY_RCU,
  31. };
  32. static int smc_release(struct socket *sock)
  33. {
  34. struct sock *sk = sock->sk;
  35. struct smc_sock *smc;
  36. if (!sk)
  37. goto out;
  38. smc = smc_sk(sk);
  39. lock_sock(sk);
  40. sk->sk_state = SMC_CLOSED;
  41. if (smc->clcsock) {
  42. sock_release(smc->clcsock);
  43. smc->clcsock = NULL;
  44. }
  45. /* detach socket */
  46. sock_orphan(sk);
  47. sock->sk = NULL;
  48. release_sock(sk);
  49. sock_put(sk);
  50. out:
  51. return 0;
  52. }
  53. static void smc_destruct(struct sock *sk)
  54. {
  55. if (sk->sk_state != SMC_CLOSED)
  56. return;
  57. if (!sock_flag(sk, SOCK_DEAD))
  58. return;
  59. sk_refcnt_debug_dec(sk);
  60. }
  61. static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
  62. {
  63. struct smc_sock *smc;
  64. struct sock *sk;
  65. sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
  66. if (!sk)
  67. return NULL;
  68. sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
  69. sk->sk_state = SMC_INIT;
  70. sk->sk_destruct = smc_destruct;
  71. sk->sk_protocol = SMCPROTO_SMC;
  72. sk_refcnt_debug_inc(sk);
  73. smc = smc_sk(sk);
  74. return sk;
  75. }
  76. static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
  77. int addr_len)
  78. {
  79. struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
  80. struct sock *sk = sock->sk;
  81. struct smc_sock *smc;
  82. int rc;
  83. smc = smc_sk(sk);
  84. /* replicate tests from inet_bind(), to be safe wrt. future changes */
  85. rc = -EINVAL;
  86. if (addr_len < sizeof(struct sockaddr_in))
  87. goto out;
  88. rc = -EAFNOSUPPORT;
  89. /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
  90. if ((addr->sin_family != AF_INET) &&
  91. ((addr->sin_family != AF_UNSPEC) ||
  92. (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
  93. goto out;
  94. lock_sock(sk);
  95. /* Check if socket is already active */
  96. rc = -EINVAL;
  97. if (sk->sk_state != SMC_INIT)
  98. goto out_rel;
  99. smc->clcsock->sk->sk_reuse = sk->sk_reuse;
  100. rc = kernel_bind(smc->clcsock, uaddr, addr_len);
  101. out_rel:
  102. release_sock(sk);
  103. out:
  104. return rc;
  105. }
  106. static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
  107. unsigned long mask)
  108. {
  109. /* options we don't get control via setsockopt for */
  110. nsk->sk_type = osk->sk_type;
  111. nsk->sk_sndbuf = osk->sk_sndbuf;
  112. nsk->sk_rcvbuf = osk->sk_rcvbuf;
  113. nsk->sk_sndtimeo = osk->sk_sndtimeo;
  114. nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
  115. nsk->sk_mark = osk->sk_mark;
  116. nsk->sk_priority = osk->sk_priority;
  117. nsk->sk_rcvlowat = osk->sk_rcvlowat;
  118. nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
  119. nsk->sk_err = osk->sk_err;
  120. nsk->sk_flags &= ~mask;
  121. nsk->sk_flags |= osk->sk_flags & mask;
  122. }
  123. #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
  124. (1UL << SOCK_KEEPOPEN) | \
  125. (1UL << SOCK_LINGER) | \
  126. (1UL << SOCK_BROADCAST) | \
  127. (1UL << SOCK_TIMESTAMP) | \
  128. (1UL << SOCK_DBG) | \
  129. (1UL << SOCK_RCVTSTAMP) | \
  130. (1UL << SOCK_RCVTSTAMPNS) | \
  131. (1UL << SOCK_LOCALROUTE) | \
  132. (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
  133. (1UL << SOCK_RXQ_OVFL) | \
  134. (1UL << SOCK_WIFI_STATUS) | \
  135. (1UL << SOCK_NOFCS) | \
  136. (1UL << SOCK_FILTER_LOCKED))
  137. /* copy only relevant settings and flags of SOL_SOCKET level from smc to
  138. * clc socket (since smc is not called for these options from net/core)
  139. */
  140. static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
  141. {
  142. smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
  143. }
  144. #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
  145. (1UL << SOCK_KEEPOPEN) | \
  146. (1UL << SOCK_LINGER) | \
  147. (1UL << SOCK_DBG))
  148. /* copy only settings and flags relevant for smc from clc to smc socket */
  149. static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
  150. {
  151. smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
  152. }
  153. static int smc_connect(struct socket *sock, struct sockaddr *addr,
  154. int alen, int flags)
  155. {
  156. struct sock *sk = sock->sk;
  157. struct smc_sock *smc;
  158. int rc = -EINVAL;
  159. smc = smc_sk(sk);
  160. /* separate smc parameter checking to be safe */
  161. if (alen < sizeof(addr->sa_family))
  162. goto out_err;
  163. if (addr->sa_family != AF_INET)
  164. goto out_err;
  165. lock_sock(sk);
  166. switch (sk->sk_state) {
  167. default:
  168. goto out;
  169. case SMC_ACTIVE:
  170. rc = -EISCONN;
  171. goto out;
  172. case SMC_INIT:
  173. rc = 0;
  174. break;
  175. }
  176. smc_copy_sock_settings_to_clc(smc);
  177. rc = kernel_connect(smc->clcsock, addr, alen, flags);
  178. if (rc)
  179. goto out;
  180. sk->sk_state = SMC_ACTIVE;
  181. /* always use TCP fallback as transport mechanism for now;
  182. * This will change once RDMA transport is implemented
  183. */
  184. smc->use_fallback = true;
  185. out:
  186. release_sock(sk);
  187. out_err:
  188. return rc;
  189. }
  190. static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
  191. {
  192. struct sock *sk = &lsmc->sk;
  193. struct socket *new_clcsock;
  194. struct sock *new_sk;
  195. int rc;
  196. new_sk = smc_sock_alloc(sock_net(sk), NULL);
  197. if (!new_sk) {
  198. rc = -ENOMEM;
  199. lsmc->sk.sk_err = ENOMEM;
  200. *new_smc = NULL;
  201. goto out;
  202. }
  203. *new_smc = smc_sk(new_sk);
  204. rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
  205. if (rc) {
  206. sock_put(new_sk);
  207. *new_smc = NULL;
  208. goto out;
  209. }
  210. (*new_smc)->clcsock = new_clcsock;
  211. out:
  212. return rc;
  213. }
  214. static int smc_listen(struct socket *sock, int backlog)
  215. {
  216. struct sock *sk = sock->sk;
  217. struct smc_sock *smc;
  218. int rc;
  219. smc = smc_sk(sk);
  220. lock_sock(sk);
  221. rc = -EINVAL;
  222. if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
  223. goto out;
  224. rc = 0;
  225. if (sk->sk_state == SMC_LISTEN) {
  226. sk->sk_max_ack_backlog = backlog;
  227. goto out;
  228. }
  229. /* some socket options are handled in core, so we could not apply
  230. * them to the clc socket -- copy smc socket options to clc socket
  231. */
  232. smc_copy_sock_settings_to_clc(smc);
  233. rc = kernel_listen(smc->clcsock, backlog);
  234. if (rc)
  235. goto out;
  236. sk->sk_max_ack_backlog = backlog;
  237. sk->sk_ack_backlog = 0;
  238. sk->sk_state = SMC_LISTEN;
  239. out:
  240. release_sock(sk);
  241. return rc;
  242. }
  243. static int smc_accept(struct socket *sock, struct socket *new_sock,
  244. int flags)
  245. {
  246. struct smc_sock *new_smc;
  247. struct sock *sk = sock->sk;
  248. struct smc_sock *lsmc;
  249. int rc;
  250. lsmc = smc_sk(sk);
  251. lock_sock(sk);
  252. if (lsmc->sk.sk_state != SMC_LISTEN) {
  253. rc = -EINVAL;
  254. goto out;
  255. }
  256. rc = smc_clcsock_accept(lsmc, &new_smc);
  257. if (rc)
  258. goto out;
  259. sock_graft(&new_smc->sk, new_sock);
  260. new_smc->sk.sk_state = SMC_ACTIVE;
  261. smc_copy_sock_settings_to_smc(new_smc);
  262. /* always use TCP fallback as transport mechanism for now;
  263. * This will change once RDMA transport is implemented
  264. */
  265. new_smc->use_fallback = true;
  266. out:
  267. release_sock(sk);
  268. return rc;
  269. }
  270. static int smc_getname(struct socket *sock, struct sockaddr *addr,
  271. int *len, int peer)
  272. {
  273. struct smc_sock *smc;
  274. if (peer && (sock->sk->sk_state != SMC_ACTIVE))
  275. return -ENOTCONN;
  276. smc = smc_sk(sock->sk);
  277. return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
  278. }
  279. static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
  280. {
  281. struct sock *sk = sock->sk;
  282. struct smc_sock *smc;
  283. int rc = -EPIPE;
  284. smc = smc_sk(sk);
  285. lock_sock(sk);
  286. if (sk->sk_state != SMC_ACTIVE)
  287. goto out;
  288. if (smc->use_fallback)
  289. rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
  290. else
  291. rc = sock_no_sendmsg(sock, msg, len);
  292. out:
  293. release_sock(sk);
  294. return rc;
  295. }
  296. static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
  297. int flags)
  298. {
  299. struct sock *sk = sock->sk;
  300. struct smc_sock *smc;
  301. int rc = -ENOTCONN;
  302. smc = smc_sk(sk);
  303. lock_sock(sk);
  304. if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
  305. goto out;
  306. if (smc->use_fallback)
  307. rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
  308. else
  309. rc = sock_no_recvmsg(sock, msg, len, flags);
  310. out:
  311. release_sock(sk);
  312. return rc;
  313. }
  314. static unsigned int smc_poll(struct file *file, struct socket *sock,
  315. poll_table *wait)
  316. {
  317. struct sock *sk = sock->sk;
  318. unsigned int mask = 0;
  319. struct smc_sock *smc;
  320. smc = smc_sk(sock->sk);
  321. if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
  322. smc->use_fallback) {
  323. mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
  324. /* if non-blocking connect finished ... */
  325. lock_sock(sk);
  326. if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
  327. sk->sk_state = SMC_ACTIVE;
  328. /* always use TCP fallback as transport mechanism;
  329. * This will change once RDMA transport is implemented
  330. */
  331. smc->use_fallback = true;
  332. }
  333. release_sock(sk);
  334. } else {
  335. mask = sock_no_poll(file, sock, wait);
  336. }
  337. return mask;
  338. }
  339. static int smc_shutdown(struct socket *sock, int how)
  340. {
  341. struct sock *sk = sock->sk;
  342. struct smc_sock *smc;
  343. int rc = -EINVAL;
  344. smc = smc_sk(sk);
  345. if ((how < SHUT_RD) || (how > SHUT_RDWR))
  346. goto out_err;
  347. lock_sock(sk);
  348. rc = -ENOTCONN;
  349. if (sk->sk_state == SMC_CLOSED)
  350. goto out;
  351. if (smc->use_fallback) {
  352. rc = kernel_sock_shutdown(smc->clcsock, how);
  353. sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
  354. if (sk->sk_shutdown == SHUTDOWN_MASK)
  355. sk->sk_state = SMC_CLOSED;
  356. } else {
  357. rc = sock_no_shutdown(sock, how);
  358. }
  359. out:
  360. release_sock(sk);
  361. out_err:
  362. return rc;
  363. }
  364. static int smc_setsockopt(struct socket *sock, int level, int optname,
  365. char __user *optval, unsigned int optlen)
  366. {
  367. struct sock *sk = sock->sk;
  368. struct smc_sock *smc;
  369. smc = smc_sk(sk);
  370. /* generic setsockopts reaching us here always apply to the
  371. * CLC socket
  372. */
  373. return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
  374. optval, optlen);
  375. }
  376. static int smc_getsockopt(struct socket *sock, int level, int optname,
  377. char __user *optval, int __user *optlen)
  378. {
  379. struct smc_sock *smc;
  380. smc = smc_sk(sock->sk);
  381. /* socket options apply to the CLC socket */
  382. return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
  383. optval, optlen);
  384. }
  385. static int smc_ioctl(struct socket *sock, unsigned int cmd,
  386. unsigned long arg)
  387. {
  388. struct smc_sock *smc;
  389. smc = smc_sk(sock->sk);
  390. if (smc->use_fallback)
  391. return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
  392. else
  393. return sock_no_ioctl(sock, cmd, arg);
  394. }
  395. static ssize_t smc_sendpage(struct socket *sock, struct page *page,
  396. int offset, size_t size, int flags)
  397. {
  398. struct sock *sk = sock->sk;
  399. struct smc_sock *smc;
  400. int rc = -EPIPE;
  401. smc = smc_sk(sk);
  402. lock_sock(sk);
  403. if (sk->sk_state != SMC_ACTIVE)
  404. goto out;
  405. if (smc->use_fallback)
  406. rc = kernel_sendpage(smc->clcsock, page, offset,
  407. size, flags);
  408. else
  409. rc = sock_no_sendpage(sock, page, offset, size, flags);
  410. out:
  411. release_sock(sk);
  412. return rc;
  413. }
  414. static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
  415. struct pipe_inode_info *pipe, size_t len,
  416. unsigned int flags)
  417. {
  418. struct sock *sk = sock->sk;
  419. struct smc_sock *smc;
  420. int rc = -ENOTCONN;
  421. smc = smc_sk(sk);
  422. lock_sock(sk);
  423. if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
  424. goto out;
  425. if (smc->use_fallback) {
  426. rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
  427. pipe, len, flags);
  428. } else {
  429. rc = -EOPNOTSUPP;
  430. }
  431. out:
  432. release_sock(sk);
  433. return rc;
  434. }
  435. /* must look like tcp */
  436. static const struct proto_ops smc_sock_ops = {
  437. .family = PF_SMC,
  438. .owner = THIS_MODULE,
  439. .release = smc_release,
  440. .bind = smc_bind,
  441. .connect = smc_connect,
  442. .socketpair = sock_no_socketpair,
  443. .accept = smc_accept,
  444. .getname = smc_getname,
  445. .poll = smc_poll,
  446. .ioctl = smc_ioctl,
  447. .listen = smc_listen,
  448. .shutdown = smc_shutdown,
  449. .setsockopt = smc_setsockopt,
  450. .getsockopt = smc_getsockopt,
  451. .sendmsg = smc_sendmsg,
  452. .recvmsg = smc_recvmsg,
  453. .mmap = sock_no_mmap,
  454. .sendpage = smc_sendpage,
  455. .splice_read = smc_splice_read,
  456. };
  457. static int smc_create(struct net *net, struct socket *sock, int protocol,
  458. int kern)
  459. {
  460. struct smc_sock *smc;
  461. struct sock *sk;
  462. int rc;
  463. rc = -ESOCKTNOSUPPORT;
  464. if (sock->type != SOCK_STREAM)
  465. goto out;
  466. rc = -EPROTONOSUPPORT;
  467. if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
  468. goto out;
  469. rc = -ENOBUFS;
  470. sock->ops = &smc_sock_ops;
  471. sk = smc_sock_alloc(net, sock);
  472. if (!sk)
  473. goto out;
  474. /* create internal TCP socket for CLC handshake and fallback */
  475. smc = smc_sk(sk);
  476. rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
  477. IPPROTO_TCP, &smc->clcsock);
  478. if (rc)
  479. sk_common_release(sk);
  480. out:
  481. return rc;
  482. }
  483. static const struct net_proto_family smc_sock_family_ops = {
  484. .family = PF_SMC,
  485. .owner = THIS_MODULE,
  486. .create = smc_create,
  487. };
  488. static int __init smc_init(void)
  489. {
  490. int rc;
  491. rc = proto_register(&smc_proto, 1);
  492. if (rc) {
  493. pr_err("%s: proto_register fails with %d\n", __func__, rc);
  494. goto out;
  495. }
  496. rc = sock_register(&smc_sock_family_ops);
  497. if (rc) {
  498. pr_err("%s: sock_register fails with %d\n", __func__, rc);
  499. goto out_proto;
  500. }
  501. return 0;
  502. out_proto:
  503. proto_unregister(&smc_proto);
  504. out:
  505. return rc;
  506. }
  507. static void __exit smc_exit(void)
  508. {
  509. sock_unregister(PF_SMC);
  510. proto_unregister(&smc_proto);
  511. }
  512. module_init(smc_init);
  513. module_exit(smc_exit);
  514. MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
  515. MODULE_DESCRIPTION("smc socket address family");
  516. MODULE_LICENSE("GPL");
  517. MODULE_ALIAS_NETPROTO(PF_SMC);