/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * This file is released under GPLv2 or later.
 *
 * (part of code stolen from loop.c)
 */

#include <linux/major.h>

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <net/sock.h>
#include <linux/net.h>
#include <linux/kthread.h>
#include <linux/types.h>
#include <linux/debugfs.h>

#include <asm/uaccess.h>
#include <asm/types.h>

#include <linux/nbd.h>

struct nbd_device {
	u32 flags;
	struct socket *sock;	/* If == NULL, device is not ready, yet	*/
	int magic;

	spinlock_t queue_lock;
	struct list_head queue_head;	/* Requests waiting result */
	struct request *active_req;
	wait_queue_head_t active_wq;
	struct list_head waiting_queue;	/* Requests to be sent */
	wait_queue_head_t waiting_wq;

	struct mutex tx_lock;
	struct gendisk *disk;
	int blksize;
	loff_t bytesize;
	int xmit_timeout;
	bool timedout;
	bool disconnect; /* a disconnect has been requested by user */

	struct timer_list timeout_timer;
	/* protects initialization and shutdown of the socket */
	spinlock_t sock_lock;
	struct task_struct *task_recv;
	struct task_struct *task_send;

#if IS_ENABLED(CONFIG_DEBUG_FS)
	struct dentry *dbg_dir;
#endif
};
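
/*
 * Request flow, in brief (a descriptive summary of the code below):
 *
 *   nbd_request_handler()  - block layer callback; moves each struct request
 *                            onto waiting_queue and wakes waiting_wq.
 *   nbd_thread_send()      - kthread started by NBD_DO_IT; pulls requests off
 *                            waiting_queue, transmits them via nbd_send_req()
 *                            and parks them on queue_head to await a reply.
 *   nbd_thread_recv()      - runs in the context of the NBD_DO_IT ioctl; reads
 *                            replies with nbd_read_stat(), matches them to
 *                            requests on queue_head and completes them.
 */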

#if IS_ENABLED(CONFIG_DEBUG_FS)
static struct dentry *nbd_dbg_dir;
#endif

#define nbd_name(nbd) ((nbd)->disk->disk_name)

#define NBD_MAGIC 0x68797548

static unsigned int nbds_max = 16;
static struct nbd_device *nbd_dev;
static int max_part;

/*
 * Use just one lock (or at most 1 per NIC). Two arguments for this:
 * 1. Each NIC is essentially a synchronization point for all servers
 *    accessed through that NIC so there's no need to have more locks
 *    than NICs anyway.
 * 2. More locks lead to more "Dirty cache line bouncing" which will slow
 *    down each lock to the point where they're actually slower than just
 *    a single lock.
 * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this!
 */
static DEFINE_SPINLOCK(nbd_lock);

static inline struct device *nbd_to_dev(struct nbd_device *nbd)
{
	return disk_to_dev(nbd->disk);
}

static bool nbd_is_connected(struct nbd_device *nbd)
{
	return !!nbd->task_recv;
}

static const char *nbdcmd_to_ascii(int cmd)
{
	switch (cmd) {
	case NBD_CMD_READ: return "read";
	case NBD_CMD_WRITE: return "write";
	case NBD_CMD_DISC: return "disconnect";
	case NBD_CMD_FLUSH: return "flush";
	case NBD_CMD_TRIM: return "trim/discard";
	}
	return "invalid";
}

static int nbd_size_clear(struct nbd_device *nbd, struct block_device *bdev)
{
	bdev->bd_inode->i_size = 0;
	set_capacity(nbd->disk, 0);
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);

	return 0;
}

static void nbd_size_update(struct nbd_device *nbd, struct block_device *bdev)
{
	if (!nbd_is_connected(nbd))
		return;

	bdev->bd_inode->i_size = nbd->bytesize;
	set_capacity(nbd->disk, nbd->bytesize >> 9);
	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
}

static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev,
			int blocksize, int nr_blocks)
{
	int ret;

	ret = set_blocksize(bdev, blocksize);
	if (ret)
		return ret;

	nbd->blksize = blocksize;
	nbd->bytesize = (loff_t)blocksize * (loff_t)nr_blocks;

	nbd_size_update(nbd, bdev);

	return 0;
}

static void nbd_end_request(struct nbd_device *nbd, struct request *req)
{
	int error = req->errors ? -EIO : 0;
	struct request_queue *q = req->q;
	unsigned long flags;

	dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req,
		error ? "failed" : "done");

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_end_request_all(req, error);
	spin_unlock_irqrestore(q->queue_lock, flags);
}

/*
 * Forcibly shutdown the socket causing all listeners to error
 */
static void sock_shutdown(struct nbd_device *nbd)
{
	spin_lock_irq(&nbd->sock_lock);

	if (!nbd->sock) {
		spin_unlock_irq(&nbd->sock_lock);
		return;
	}

	dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
	kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
	sockfd_put(nbd->sock);
	nbd->sock = NULL;
	spin_unlock_irq(&nbd->sock_lock);

	del_timer(&nbd->timeout_timer);
}

static void nbd_xmit_timeout(unsigned long arg)
{
	struct nbd_device *nbd = (struct nbd_device *)arg;
	unsigned long flags;

	if (list_empty(&nbd->queue_head))
		return;

	spin_lock_irqsave(&nbd->sock_lock, flags);

	nbd->timedout = true;

	if (nbd->sock)
		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);

	spin_unlock_irqrestore(&nbd->sock_lock, flags);

	dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n");
}
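
/*
 * Timeout handling, in brief: nbd_handle_req() and sock_xmit() (re)arm
 * timeout_timer whenever xmit_timeout is set.  If the timer fires while
 * requests are still outstanding, nbd_xmit_timeout() above marks the device
 * as timed out and shuts the socket down, which makes any blocked
 * sock_xmit() call fail; NBD_DO_IT then returns -ETIMEDOUT.
 */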

/*
 *  Send or receive packet.
 */
static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
		int msg_flags)
{
	struct socket *sock = nbd->sock;
	int result;
	struct msghdr msg;
	struct kvec iov;
	unsigned long pflags = current->flags;

	if (unlikely(!sock)) {
		dev_err(disk_to_dev(nbd->disk),
			"Attempted %s on closed socket in sock_xmit\n",
			(send ? "send" : "recv"));
		return -EINVAL;
	}

	current->flags |= PF_MEMALLOC;
	do {
		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
		iov.iov_base = buf;
		iov.iov_len = size;
		msg.msg_name = NULL;
		msg.msg_namelen = 0;
		msg.msg_control = NULL;
		msg.msg_controllen = 0;
		msg.msg_flags = msg_flags | MSG_NOSIGNAL;

		if (send)
			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
		else
			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
						msg.msg_flags);

		if (result <= 0) {
			if (result == 0)
				result = -EPIPE; /* short read */
			break;
		}
		size -= result;
		buf += result;
	} while (size > 0);

	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	if (!send && nbd->xmit_timeout)
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	return result;
}

static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
				 int flags)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);

	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
			   bvec->bv_len, flags);
	kunmap(bvec->bv_page);
	return result;
}
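
/*
 * On-the-wire layout used by nbd_send_req() and nbd_read_stat() below.  The
 * structures themselves live in <linux/nbd.h>; this is only a summary, with
 * all multi-byte fields in network byte order:
 *
 *   struct nbd_request (28 bytes)          struct nbd_reply (16 bytes)
 *     __be32 magic  (NBD_REQUEST_MAGIC)      __be32 magic  (NBD_REPLY_MAGIC)
 *     __be32 type   (NBD_CMD_*)              __be32 error  (0 on success)
 *     char   handle[8]                       char   handle[8]
 *     __be64 from   (byte offset)
 *     __be32 len    (byte count)
 *
 * The handle is opaque to the server and echoed back in the reply; this
 * driver stores the struct request pointer in it.
 */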

/* always call with the tx_lock held */
static int nbd_send_req(struct nbd_device *nbd, struct request *req)
{
	int result, flags;
	struct nbd_request request;
	unsigned long size = blk_rq_bytes(req);
	u32 type;

	if (req->cmd_type == REQ_TYPE_DRV_PRIV)
		type = NBD_CMD_DISC;
	else if (req->cmd_flags & REQ_DISCARD)
		type = NBD_CMD_TRIM;
	else if (req->cmd_flags & REQ_FLUSH)
		type = NBD_CMD_FLUSH;
	else if (rq_data_dir(req) == WRITE)
		type = NBD_CMD_WRITE;
	else
		type = NBD_CMD_READ;

	memset(&request, 0, sizeof(request));
	request.magic = htonl(NBD_REQUEST_MAGIC);
	request.type = htonl(type);

	if (type != NBD_CMD_FLUSH && type != NBD_CMD_DISC) {
		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
		request.len = htonl(size);
	}
	memcpy(request.handle, &req, sizeof(req));

	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
		req, nbdcmd_to_ascii(type),
		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
	result = sock_xmit(nbd, 1, &request, sizeof(request),
			   (type == NBD_CMD_WRITE) ? MSG_MORE : 0);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Send control failed (result %d)\n", result);
		return -EIO;
	}

	if (type == NBD_CMD_WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		/*
		 * we are really probing at internals to determine
		 * whether to set MSG_MORE or not...
		 */
		rq_for_each_segment(bvec, req, iter) {
			flags = 0;
			if (!rq_iter_last(bvec, iter))
				flags = MSG_MORE;
			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
				req, bvec.bv_len);
			result = sock_send_bvec(nbd, &bvec, flags);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk),
					"Send data failed (result %d)\n",
					result);
				return -EIO;
			}
		}
	}
	return 0;
}

static struct request *nbd_find_request(struct nbd_device *nbd,
					struct request *xreq)
{
	struct request *req, *tmp;
	int err;

	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
	if (unlikely(err))
		return ERR_PTR(err);

	spin_lock(&nbd->queue_lock);
	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
		if (req != xreq)
			continue;
		list_del_init(&req->queuelist);
		spin_unlock(&nbd->queue_lock);
		return req;
	}
	spin_unlock(&nbd->queue_lock);

	return ERR_PTR(-ENOENT);
}
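
/*
 * Note on the wait above: the send side publishes a request as
 * nbd->active_req before transmitting it and only adds it to queue_head
 * afterwards (see nbd_handle_req()).  Waiting until active_req != xreq
 * ensures the receive path never completes a request whose data is still
 * being written to the socket.
 */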

static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
{
	int result;
	void *kaddr = kmap(bvec->bv_page);

	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
			   MSG_WAITALL);
	kunmap(bvec->bv_page);
	return result;
}

/* an ERR_PTR return = something went wrong, inform userspace */
static struct request *nbd_read_stat(struct nbd_device *nbd)
{
	int result;
	struct nbd_reply reply;
	struct request *req;

	reply.magic = 0;
	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
	if (result <= 0) {
		dev_err(disk_to_dev(nbd->disk),
			"Receive control failed (result %d)\n", result);
		return ERR_PTR(result);
	}

	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
				(unsigned long)ntohl(reply.magic));
		return ERR_PTR(-EPROTO);
	}

	req = nbd_find_request(nbd, *(struct request **)reply.handle);
	if (IS_ERR(req)) {
		result = PTR_ERR(req);
		if (result != -ENOENT)
			return ERR_PTR(result);

		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
			reply.handle);
		return ERR_PTR(-EBADR);
	}

	if (ntohl(reply.error)) {
		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
			ntohl(reply.error));
		req->errors++;
		return req;
	}

	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
	if (rq_data_dir(req) != WRITE) {
		struct req_iterator iter;
		struct bio_vec bvec;

		rq_for_each_segment(bvec, req, iter) {
			result = sock_recv_bvec(nbd, &bvec);
			if (result <= 0) {
				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
					result);
				req->errors++;
				return req;
			}
			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
				req, bvec.bv_len);
		}
	}
	return req;
}

static ssize_t pid_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	struct gendisk *disk = dev_to_disk(dev);
	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;

	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
}

static struct device_attribute pid_attr = {
	.attr = { .name = "pid", .mode = S_IRUGO},
	.show = pid_show,
};

static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev)
{
	struct request *req;
	int ret;

	BUG_ON(nbd->magic != NBD_MAGIC);

	sk_set_memalloc(nbd->sock->sk);

	nbd->task_recv = current;

	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
	if (ret) {
		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");

		nbd->task_recv = NULL;

		return ret;
	}

	nbd_size_update(nbd, bdev);

	while (1) {
		req = nbd_read_stat(nbd);
		if (IS_ERR(req)) {
			ret = PTR_ERR(req);
			break;
		}

		nbd_end_request(nbd, req);
	}

	nbd_size_clear(nbd, bdev);

	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);

	nbd->task_recv = NULL;

	return ret;
}

static void nbd_clear_que(struct nbd_device *nbd)
{
	struct request *req;

	BUG_ON(nbd->magic != NBD_MAGIC);

	/*
	 * Because we have set nbd->sock to NULL under the tx_lock, all
	 * modifications to the list must have completed by now. For
	 * the same reason, the active_req must be NULL.
	 *
	 * As a consequence, we don't need to take the spin lock while
	 * purging the list here.
	 */
	BUG_ON(nbd->sock);
	BUG_ON(nbd->active_req);

	while (!list_empty(&nbd->queue_head)) {
		req = list_entry(nbd->queue_head.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}

	while (!list_empty(&nbd->waiting_queue)) {
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		req->errors++;
		nbd_end_request(nbd, req);
	}
	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
}

static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
{
	if (req->cmd_type != REQ_TYPE_FS)
		goto error_out;

	if (rq_data_dir(req) == WRITE &&
	    (nbd->flags & NBD_FLAG_READ_ONLY)) {
		dev_err(disk_to_dev(nbd->disk),
			"Write on read-only\n");
		goto error_out;
	}

	req->errors = 0;

	mutex_lock(&nbd->tx_lock);
	if (unlikely(!nbd->sock)) {
		mutex_unlock(&nbd->tx_lock);
		dev_err(disk_to_dev(nbd->disk),
			"Attempted send on closed socket\n");
		goto error_out;
	}

	nbd->active_req = req;

	if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head))
		mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout);

	if (nbd_send_req(nbd, req) != 0) {
		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
		req->errors++;
		nbd_end_request(nbd, req);
	} else {
		spin_lock(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->queue_head);
		spin_unlock(&nbd->queue_lock);
	}

	nbd->active_req = NULL;
	mutex_unlock(&nbd->tx_lock);
	wake_up_all(&nbd->active_wq);

	return;

error_out:
	req->errors++;
	nbd_end_request(nbd, req);
}

static int nbd_thread_send(void *data)
{
	struct nbd_device *nbd = data;
	struct request *req;

	nbd->task_send = current;

	set_user_nice(current, MIN_NICE);
	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
		/* wait for something to do */
		wait_event_interruptible(nbd->waiting_wq,
					 kthread_should_stop() ||
					 !list_empty(&nbd->waiting_queue));

		/* extract request */
		if (list_empty(&nbd->waiting_queue))
			continue;

		spin_lock_irq(&nbd->queue_lock);
		req = list_entry(nbd->waiting_queue.next, struct request,
				 queuelist);
		list_del_init(&req->queuelist);
		spin_unlock_irq(&nbd->queue_lock);

		/* handle request */
		nbd_handle_req(nbd, req);
	}

	nbd->task_send = NULL;

	return 0;
}

/*
 * We always wait for result of write, for now. It would be nice to make it optional
 * in future
 * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
 *   { printk("Warning: Ignoring result!\n"); nbd_end_request(req); }
 */
static void nbd_request_handler(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct request *req;

	while ((req = blk_fetch_request(q)) != NULL) {
		struct nbd_device *nbd;

		spin_unlock_irq(q->queue_lock);

		nbd = req->rq_disk->private_data;

		BUG_ON(nbd->magic != NBD_MAGIC);

		dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n",
			req, req->cmd_type);

		if (unlikely(!nbd->sock)) {
			dev_err_ratelimited(disk_to_dev(nbd->disk),
					    "Attempted send on closed socket\n");
			req->errors++;
			nbd_end_request(nbd, req);
			spin_lock_irq(q->queue_lock);
			continue;
		}

		spin_lock_irq(&nbd->queue_lock);
		list_add_tail(&req->queuelist, &nbd->waiting_queue);
		spin_unlock_irq(&nbd->queue_lock);

		wake_up(&nbd->waiting_wq);

		spin_lock_irq(q->queue_lock);
	}
}

static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock)
{
	int ret = 0;

	spin_lock_irq(&nbd->sock_lock);

	if (nbd->sock) {
		ret = -EBUSY;
		goto out;
	}

	nbd->sock = sock;

out:
	spin_unlock_irq(&nbd->sock_lock);

	return ret;
}

/* Reset all properties of an NBD device */
static void nbd_reset(struct nbd_device *nbd)
{
	nbd->disconnect = false;
	nbd->timedout = false;
	nbd->blksize = 1024;
	nbd->bytesize = 0;
	set_capacity(nbd->disk, 0);
	nbd->flags = 0;
	nbd->xmit_timeout = 0;
	queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	del_timer_sync(&nbd->timeout_timer);
}

static void nbd_bdev_reset(struct block_device *bdev)
{
	set_device_ro(bdev, false);
	bdev->bd_inode->i_size = 0;
	if (max_part > 0) {
		blkdev_reread_part(bdev);
		bdev->bd_invalidated = 1;
	}
}

static void nbd_parse_flags(struct nbd_device *nbd, struct block_device *bdev)
{
	if (nbd->flags & NBD_FLAG_READ_ONLY)
		set_device_ro(bdev, true);
	if (nbd->flags & NBD_FLAG_SEND_TRIM)
		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue);
	if (nbd->flags & NBD_FLAG_SEND_FLUSH)
		blk_queue_flush(nbd->disk->queue, REQ_FLUSH);
	else
		blk_queue_flush(nbd->disk->queue, 0);
}

static int nbd_dev_dbg_init(struct nbd_device *nbd);
static void nbd_dev_dbg_close(struct nbd_device *nbd);

/* Must be called with tx_lock held */
static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
		       unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case NBD_DISCONNECT: {
		struct request sreq;

		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
		if (!nbd->sock)
			return -EINVAL;

		mutex_unlock(&nbd->tx_lock);
		fsync_bdev(bdev);
		mutex_lock(&nbd->tx_lock);
		blk_rq_init(NULL, &sreq);
		sreq.cmd_type = REQ_TYPE_DRV_PRIV;

		/* Check again after getting mutex back.  */
		if (!nbd->sock)
			return -EINVAL;

		nbd->disconnect = true;

		nbd_send_req(nbd, &sreq);
		return 0;
	}

	case NBD_CLEAR_SOCK:
		sock_shutdown(nbd);
		nbd_clear_que(nbd);
		BUG_ON(!list_empty(&nbd->queue_head));
		BUG_ON(!list_empty(&nbd->waiting_queue));
		kill_bdev(bdev);
		return 0;

	case NBD_SET_SOCK: {
		int err;
		struct socket *sock = sockfd_lookup(arg, &err);

		if (!sock)
			return err;

		err = nbd_set_socket(nbd, sock);
		if (!err && max_part)
			bdev->bd_invalidated = 1;

		return err;
	}

	case NBD_SET_BLKSIZE: {
		loff_t bsize = div_s64(nbd->bytesize, arg);

		return nbd_size_set(nbd, bdev, arg, bsize);
	}

	case NBD_SET_SIZE:
		return nbd_size_set(nbd, bdev, nbd->blksize,
				    arg / nbd->blksize);

	case NBD_SET_SIZE_BLOCKS:
		return nbd_size_set(nbd, bdev, nbd->blksize, arg);

	case NBD_SET_TIMEOUT:
		nbd->xmit_timeout = arg * HZ;
		if (arg)
			mod_timer(&nbd->timeout_timer,
				  jiffies + nbd->xmit_timeout);
		else
			del_timer_sync(&nbd->timeout_timer);

		return 0;

	case NBD_SET_FLAGS:
		nbd->flags = arg;
		return 0;

	case NBD_DO_IT: {
		struct task_struct *thread;
		int error;

		if (nbd->task_recv)
			return -EBUSY;
		if (!nbd->sock)
			return -EINVAL;

		mutex_unlock(&nbd->tx_lock);

		nbd_parse_flags(nbd, bdev);

		thread = kthread_run(nbd_thread_send, nbd, "%s",
				     nbd_name(nbd));
		if (IS_ERR(thread)) {
			mutex_lock(&nbd->tx_lock);
			return PTR_ERR(thread);
		}

		nbd_dev_dbg_init(nbd);
		error = nbd_thread_recv(nbd, bdev);
		nbd_dev_dbg_close(nbd);
		kthread_stop(thread);

		mutex_lock(&nbd->tx_lock);

		sock_shutdown(nbd);
		nbd_clear_que(nbd);
		kill_bdev(bdev);
		nbd_bdev_reset(bdev);

		if (nbd->disconnect) /* user requested, ignore socket errors */
			error = 0;
		if (nbd->timedout)
			error = -ETIMEDOUT;

		nbd_reset(nbd);

		return error;
	}

	case NBD_CLEAR_QUE:
		/*
		 * This is for compatibility only.  The queue is always cleared
		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
		 */
		return 0;

	case NBD_PRINT_DEBUG:
		dev_info(disk_to_dev(nbd->disk),
			"next = %p, prev = %p, head = %p\n",
			nbd->queue_head.next, nbd->queue_head.prev,
			&nbd->queue_head);
		return 0;
	}
	return -ENOTTY;
}

static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct nbd_device *nbd = bdev->bd_disk->private_data;
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	BUG_ON(nbd->magic != NBD_MAGIC);

	mutex_lock(&nbd->tx_lock);
	error = __nbd_ioctl(bdev, nbd, cmd, arg);
	mutex_unlock(&nbd->tx_lock);

	return error;
}

static const struct block_device_operations nbd_fops =
{
	.owner =	THIS_MODULE,
	.ioctl =	nbd_ioctl,
	.compat_ioctl =	nbd_ioctl,
};
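
/*
 * Illustrative userspace sequence (a sketch, not part of this driver; error
 * handling omitted and sock_fd/nr_blocks/server_flags are placeholders).
 * A client such as nbd-client typically connects a TCP socket to the server
 * and then drives the ioctls handled above:
 *
 *	int nbd = open("/dev/nbd0", O_RDWR);
 *	ioctl(nbd, NBD_SET_SOCK, sock_fd);		// hand over connected socket
 *	ioctl(nbd, NBD_SET_BLKSIZE, 4096UL);		// block size in bytes
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);	// export size in blocks
 *	ioctl(nbd, NBD_SET_FLAGS, server_flags);	// NBD_FLAG_* from negotiation
 *	ioctl(nbd, NBD_DO_IT);				// blocks until disconnect/error
 *
 * Teardown is NBD_DISCONNECT (usually issued from another process, since
 * NBD_DO_IT blocks) followed by NBD_CLEAR_SOCK.
 */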

#if IS_ENABLED(CONFIG_DEBUG_FS)

static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;

	if (nbd->task_recv)
		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
	if (nbd->task_send)
		seq_printf(s, "send: %d\n", task_pid_nr(nbd->task_send));

	return 0;
}

static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
}

static const struct file_operations nbd_dbg_tasks_ops = {
	.open = nbd_dbg_tasks_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
{
	struct nbd_device *nbd = s->private;
	u32 flags = nbd->flags;

	seq_printf(s, "Hex: 0x%08x\n\n", flags);

	seq_puts(s, "Known flags:\n");

	if (flags & NBD_FLAG_HAS_FLAGS)
		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
	if (flags & NBD_FLAG_READ_ONLY)
		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
	if (flags & NBD_FLAG_SEND_FLUSH)
		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
	if (flags & NBD_FLAG_SEND_TRIM)
		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");

	return 0;
}

static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
{
	return single_open(file, nbd_dbg_flags_show, inode->i_private);
}

static const struct file_operations nbd_dbg_flags_ops = {
	.open = nbd_dbg_flags_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	struct dentry *dir;

	if (!nbd_dbg_dir)
		return -EIO;

	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
	if (!dir) {
		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
			nbd_name(nbd));
		return -EIO;
	}
	nbd->dbg_dir = dir;

	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
	debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize);
	debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout);
	debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize);
	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);

	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
	debugfs_remove_recursive(nbd->dbg_dir);
}

static int nbd_dbg_init(void)
{
	struct dentry *dbg_dir;

	dbg_dir = debugfs_create_dir("nbd", NULL);
	if (!dbg_dir)
		return -EIO;

	nbd_dbg_dir = dbg_dir;

	return 0;
}

static void nbd_dbg_close(void)
{
	debugfs_remove_recursive(nbd_dbg_dir);
}

#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */

static int nbd_dev_dbg_init(struct nbd_device *nbd)
{
	return 0;
}

static void nbd_dev_dbg_close(struct nbd_device *nbd)
{
}

static int nbd_dbg_init(void)
{
	return 0;
}

static void nbd_dbg_close(void)
{
}

#endif

/*
 * And here should be modules and kernel interface
 *  (Just smiley confuses emacs :-)
 */
static int __init nbd_init(void)
{
	int err = -ENOMEM;
	int i;
	int part_shift;

	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);

	if (max_part < 0) {
		printk(KERN_ERR "nbd: max_part must be >= 0\n");
		return -EINVAL;
	}

	part_shift = 0;
	if (max_part > 0) {
		part_shift = fls(max_part);

		/*
		 * Adjust max_part according to part_shift as it is exported
		 * to user space so that user can know the max number of
		 * partition kernel should be able to manage.
		 *
		 * Note that -1 is required because partition 0 is reserved
		 * for the whole disk.
		 */
		max_part = (1UL << part_shift) - 1;
	}

	if ((1UL << part_shift) > DISK_MAX_PARTS)
		return -EINVAL;

	if (nbds_max > 1UL << (MINORBITS - part_shift))
		return -EINVAL;

	nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
	if (!nbd_dev)
		return -ENOMEM;

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = alloc_disk(1 << part_shift);
		if (!disk)
			goto out;
		nbd_dev[i].disk = disk;
		/*
		 * The new linux 2.5 block layer implementation requires
		 * every gendisk to have its very own request_queue struct.
		 * These structs are big so we dynamically allocate them.
		 */
		disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock);
		if (!disk->queue) {
			put_disk(disk);
			goto out;
		}
		/*
		 * Tell the block layer that we are not a rotational device
		 */
		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, disk->queue);
		queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, disk->queue);
		disk->queue->limits.discard_granularity = 512;
		blk_queue_max_discard_sectors(disk->queue, UINT_MAX);
		disk->queue->limits.discard_zeroes_data = 0;
		blk_queue_max_hw_sectors(disk->queue, 65536);
		disk->queue->limits.max_sectors = 256;
	}

	if (register_blkdev(NBD_MAJOR, "nbd")) {
		err = -EIO;
		goto out;
	}

	printk(KERN_INFO "nbd: registered device at major %d\n", NBD_MAJOR);

	nbd_dbg_init();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = NBD_MAGIC;
		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
		spin_lock_init(&nbd_dev[i].queue_lock);
		spin_lock_init(&nbd_dev[i].sock_lock);
		INIT_LIST_HEAD(&nbd_dev[i].queue_head);
		mutex_init(&nbd_dev[i].tx_lock);
		init_timer(&nbd_dev[i].timeout_timer);
		nbd_dev[i].timeout_timer.function = nbd_xmit_timeout;
		nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i];
		init_waitqueue_head(&nbd_dev[i].active_wq);
		init_waitqueue_head(&nbd_dev[i].waiting_wq);
		disk->major = NBD_MAJOR;
		disk->first_minor = i << part_shift;
		disk->fops = &nbd_fops;
		disk->private_data = &nbd_dev[i];
		sprintf(disk->disk_name, "nbd%d", i);
		nbd_reset(&nbd_dev[i]);
		add_disk(disk);
	}

	return 0;
out:
	while (i--) {
		blk_cleanup_queue(nbd_dev[i].disk->queue);
		put_disk(nbd_dev[i].disk);
	}
	kfree(nbd_dev);
	return err;
}

static void __exit nbd_cleanup(void)
{
	int i;

	nbd_dbg_close();

	for (i = 0; i < nbds_max; i++) {
		struct gendisk *disk = nbd_dev[i].disk;
		nbd_dev[i].magic = 0;
		if (disk) {
			del_gendisk(disk);
			blk_cleanup_queue(disk->queue);
			put_disk(disk);
		}
	}
	unregister_blkdev(NBD_MAJOR, "nbd");
	kfree(nbd_dev);
	printk(KERN_INFO "nbd: unregistered device at major %d\n", NBD_MAJOR);
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");

module_param(nbds_max, int, 0444);
MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
module_param(max_part, int, 0444);
MODULE_PARM_DESC(max_part, "number of partitions per device (default: 0)");
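
/*
 * Usage example (illustrative, values are arbitrary):
 *
 *	modprobe nbd nbds_max=4 max_part=8
 *
 * creates /dev/nbd0 .. /dev/nbd3; max_part is adjusted to 2^fls(8) - 1 = 15
 * as described in nbd_init() above, so each device gets minors for the whole
 * disk plus up to 15 partitions.
 */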