read_write.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286
  1. /*
  2. * linux/fs/read_write.c
  3. *
  4. * Copyright (C) 1991, 1992 Linus Torvalds
  5. */
  6. #include <linux/slab.h>
  7. #include <linux/stat.h>
  8. #include <linux/fcntl.h>
  9. #include <linux/file.h>
  10. #include <linux/uio.h>
  11. #include <linux/aio.h>
  12. #include <linux/fsnotify.h>
  13. #include <linux/security.h>
  14. #include <linux/export.h>
  15. #include <linux/syscalls.h>
  16. #include <linux/pagemap.h>
  17. #include <linux/splice.h>
  18. #include <linux/compat.h>
  19. #include "internal.h"
  20. #include <asm/uaccess.h>
  21. #include <asm/unistd.h>
  22. typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
  23. typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
  24. unsigned long, loff_t);
  25. const struct file_operations generic_ro_fops = {
  26. .llseek = generic_file_llseek,
  27. .read = do_sync_read,
  28. .aio_read = generic_file_aio_read,
  29. .mmap = generic_file_readonly_mmap,
  30. .splice_read = generic_file_splice_read,
  31. };
  32. EXPORT_SYMBOL(generic_ro_fops);
  33. static inline int unsigned_offsets(struct file *file)
  34. {
  35. return file->f_mode & FMODE_UNSIGNED_OFFSET;
  36. }
  37. /**
  38. * vfs_setpos - update the file offset for lseek
  39. * @file: file structure in question
  40. * @offset: file offset to seek to
  41. * @maxsize: maximum file size
  42. *
  43. * This is a low-level filesystem helper for updating the file offset to
  44. * the value specified by @offset if the given offset is valid and it is
  45. * not equal to the current file offset.
  46. *
  47. * Return the specified offset on success and -EINVAL on invalid offset.
  48. */
  49. loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize)
  50. {
  51. if (offset < 0 && !unsigned_offsets(file))
  52. return -EINVAL;
  53. if (offset > maxsize)
  54. return -EINVAL;
  55. if (offset != file->f_pos) {
  56. file->f_pos = offset;
  57. file->f_version = 0;
  58. }
  59. return offset;
  60. }
  61. EXPORT_SYMBOL(vfs_setpos);
  62. /**
  63. * generic_file_llseek_size - generic llseek implementation for regular files
  64. * @file: file structure to seek on
  65. * @offset: file offset to seek to
  66. * @whence: type of seek
  67. * @size: max size of this file in file system
  68. * @eof: offset used for SEEK_END position
  69. *
  70. * This is a variant of generic_file_llseek that allows passing in a custom
  71. * maximum file size and a custom EOF position, for e.g. hashed directories
  72. *
  73. * Synchronization:
  74. * SEEK_SET and SEEK_END are unsynchronized (but atomic on 64bit platforms)
  75. * SEEK_CUR is synchronized against other SEEK_CURs, but not read/writes.
  76. * read/writes behave like SEEK_SET against seeks.
  77. */
  78. loff_t
  79. generic_file_llseek_size(struct file *file, loff_t offset, int whence,
  80. loff_t maxsize, loff_t eof)
  81. {
  82. switch (whence) {
  83. case SEEK_END:
  84. offset += eof;
  85. break;
  86. case SEEK_CUR:
  87. /*
  88. * Here we special-case the lseek(fd, 0, SEEK_CUR)
  89. * position-querying operation. Avoid rewriting the "same"
  90. * f_pos value back to the file because a concurrent read(),
  91. * write() or lseek() might have altered it
  92. */
  93. if (offset == 0)
  94. return file->f_pos;
  95. /*
  96. * f_lock protects against read/modify/write race with other
  97. * SEEK_CURs. Note that parallel writes and reads behave
  98. * like SEEK_SET.
  99. */
  100. spin_lock(&file->f_lock);
  101. offset = vfs_setpos(file, file->f_pos + offset, maxsize);
  102. spin_unlock(&file->f_lock);
  103. return offset;
  104. case SEEK_DATA:
  105. /*
  106. * In the generic case the entire file is data, so as long as
  107. * offset isn't at the end of the file then the offset is data.
  108. */
  109. if (offset >= eof)
  110. return -ENXIO;
  111. break;
  112. case SEEK_HOLE:
  113. /*
  114. * There is a virtual hole at the end of the file, so as long as
  115. * offset isn't i_size or larger, return i_size.
  116. */
  117. if (offset >= eof)
  118. return -ENXIO;
  119. offset = eof;
  120. break;
  121. }
  122. return vfs_setpos(file, offset, maxsize);
  123. }
  124. EXPORT_SYMBOL(generic_file_llseek_size);
  125. /**
  126. * generic_file_llseek - generic llseek implementation for regular files
  127. * @file: file structure to seek on
  128. * @offset: file offset to seek to
  129. * @whence: type of seek
  130. *
  131. * This is a generic implemenation of ->llseek useable for all normal local
  132. * filesystems. It just updates the file offset to the value specified by
  133. * @offset and @whence.
  134. */
  135. loff_t generic_file_llseek(struct file *file, loff_t offset, int whence)
  136. {
  137. struct inode *inode = file->f_mapping->host;
  138. return generic_file_llseek_size(file, offset, whence,
  139. inode->i_sb->s_maxbytes,
  140. i_size_read(inode));
  141. }
  142. EXPORT_SYMBOL(generic_file_llseek);
  143. /**
  144. * fixed_size_llseek - llseek implementation for fixed-sized devices
  145. * @file: file structure to seek on
  146. * @offset: file offset to seek to
  147. * @whence: type of seek
  148. * @size: size of the file
  149. *
  150. */
  151. loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size)
  152. {
  153. switch (whence) {
  154. case SEEK_SET: case SEEK_CUR: case SEEK_END:
  155. return generic_file_llseek_size(file, offset, whence,
  156. size, size);
  157. default:
  158. return -EINVAL;
  159. }
  160. }
  161. EXPORT_SYMBOL(fixed_size_llseek);
  162. /**
  163. * noop_llseek - No Operation Performed llseek implementation
  164. * @file: file structure to seek on
  165. * @offset: file offset to seek to
  166. * @whence: type of seek
  167. *
  168. * This is an implementation of ->llseek useable for the rare special case when
  169. * userspace expects the seek to succeed but the (device) file is actually not
  170. * able to perform the seek. In this case you use noop_llseek() instead of
  171. * falling back to the default implementation of ->llseek.
  172. */
  173. loff_t noop_llseek(struct file *file, loff_t offset, int whence)
  174. {
  175. return file->f_pos;
  176. }
  177. EXPORT_SYMBOL(noop_llseek);
  178. loff_t no_llseek(struct file *file, loff_t offset, int whence)
  179. {
  180. return -ESPIPE;
  181. }
  182. EXPORT_SYMBOL(no_llseek);
  183. loff_t default_llseek(struct file *file, loff_t offset, int whence)
  184. {
  185. struct inode *inode = file_inode(file);
  186. loff_t retval;
  187. mutex_lock(&inode->i_mutex);
  188. switch (whence) {
  189. case SEEK_END:
  190. offset += i_size_read(inode);
  191. break;
  192. case SEEK_CUR:
  193. if (offset == 0) {
  194. retval = file->f_pos;
  195. goto out;
  196. }
  197. offset += file->f_pos;
  198. break;
  199. case SEEK_DATA:
  200. /*
  201. * In the generic case the entire file is data, so as
  202. * long as offset isn't at the end of the file then the
  203. * offset is data.
  204. */
  205. if (offset >= inode->i_size) {
  206. retval = -ENXIO;
  207. goto out;
  208. }
  209. break;
  210. case SEEK_HOLE:
  211. /*
  212. * There is a virtual hole at the end of the file, so
  213. * as long as offset isn't i_size or larger, return
  214. * i_size.
  215. */
  216. if (offset >= inode->i_size) {
  217. retval = -ENXIO;
  218. goto out;
  219. }
  220. offset = inode->i_size;
  221. break;
  222. }
  223. retval = -EINVAL;
  224. if (offset >= 0 || unsigned_offsets(file)) {
  225. if (offset != file->f_pos) {
  226. file->f_pos = offset;
  227. file->f_version = 0;
  228. }
  229. retval = offset;
  230. }
  231. out:
  232. mutex_unlock(&inode->i_mutex);
  233. return retval;
  234. }
  235. EXPORT_SYMBOL(default_llseek);
  236. loff_t vfs_llseek(struct file *file, loff_t offset, int whence)
  237. {
  238. loff_t (*fn)(struct file *, loff_t, int);
  239. fn = no_llseek;
  240. if (file->f_mode & FMODE_LSEEK) {
  241. if (file->f_op->llseek)
  242. fn = file->f_op->llseek;
  243. }
  244. return fn(file, offset, whence);
  245. }
  246. EXPORT_SYMBOL(vfs_llseek);
  247. static inline struct fd fdget_pos(int fd)
  248. {
  249. return __to_fd(__fdget_pos(fd));
  250. }
  251. static inline void fdput_pos(struct fd f)
  252. {
  253. if (f.flags & FDPUT_POS_UNLOCK)
  254. mutex_unlock(&f.file->f_pos_lock);
  255. fdput(f);
  256. }
  257. SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence)
  258. {
  259. off_t retval;
  260. struct fd f = fdget_pos(fd);
  261. if (!f.file)
  262. return -EBADF;
  263. retval = -EINVAL;
  264. if (whence <= SEEK_MAX) {
  265. loff_t res = vfs_llseek(f.file, offset, whence);
  266. retval = res;
  267. if (res != (loff_t)retval)
  268. retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
  269. }
  270. fdput_pos(f);
  271. return retval;
  272. }
  #ifdef CONFIG_COMPAT
  /* Compat lseek: forwards its arguments unchanged to sys_lseek(). */
  COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset,
  		       unsigned int, whence)
  {
  	return sys_lseek(fd, offset, whence);
  }
  #endif
  #ifdef __ARCH_WANT_SYS_LLSEEK
  /*
   * llseek: seek to the offset formed from @offset_high and @offset_low
   * and copy the resulting position to the user pointer @result.
   *
   * Returns 0 on success, -EBADF for a bad descriptor, -EINVAL when @whence
   * is above SEEK_MAX, the (int-truncated) error from vfs_llseek(), or
   * -EFAULT when the result cannot be copied out.
   */
  SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
  		unsigned long, offset_low, loff_t __user *, result,
  		unsigned int, whence)
  {
  	struct fd f = fdget_pos(fd);
  	loff_t offset;
  	int retval;

  	if (!f.file)
  		return -EBADF;

  	if (whence > SEEK_MAX) {
  		retval = -EINVAL;
  	} else {
  		offset = vfs_llseek(f.file,
  				    ((loff_t) offset_high << 32) | offset_low,
  				    whence);
  		if (offset < 0)
  			retval = (int)offset;
  		else if (copy_to_user(result, &offset, sizeof(offset)))
  			retval = -EFAULT;
  		else
  			retval = 0;
  	}

  	fdput_pos(f);
  	return retval;
  }
  #endif
  305. /*
  306. * rw_verify_area doesn't like huge counts. We limit
  307. * them to something that fits in "int" so that others
  308. * won't have to do range checks all the time.
  309. */
  310. int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
  311. {
  312. struct inode *inode;
  313. loff_t pos;
  314. int retval = -EINVAL;
  315. inode = file_inode(file);
  316. if (unlikely((ssize_t) count < 0))
  317. return retval;
  318. pos = *ppos;
  319. if (unlikely(pos < 0)) {
  320. if (!unsigned_offsets(file))
  321. return retval;
  322. if (count >= -pos) /* both values are in 0..LLONG_MAX */
  323. return -EOVERFLOW;
  324. } else if (unlikely((loff_t) (pos + count) < 0)) {
  325. if (!unsigned_offsets(file))
  326. return retval;
  327. }
  328. if (unlikely(inode->i_flock && mandatory_lock(inode))) {
  329. retval = locks_mandatory_area(
  330. read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
  331. inode, file, pos, count);
  332. if (retval < 0)
  333. return retval;
  334. }
  335. retval = security_file_permission(file,
  336. read_write == READ ? MAY_READ : MAY_WRITE);
  337. if (retval)
  338. return retval;
  339. return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
  340. }
  341. ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
  342. {
  343. struct iovec iov = { .iov_base = buf, .iov_len = len };
  344. struct kiocb kiocb;
  345. ssize_t ret;
  346. init_sync_kiocb(&kiocb, filp);
  347. kiocb.ki_pos = *ppos;
  348. kiocb.ki_nbytes = len;
  349. ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
  350. if (-EIOCBQUEUED == ret)
  351. ret = wait_on_sync_kiocb(&kiocb);
  352. *ppos = kiocb.ki_pos;
  353. return ret;
  354. }
  355. EXPORT_SYMBOL(do_sync_read);
  356. ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
  357. {
  358. ssize_t ret;
  359. if (!(file->f_mode & FMODE_READ))
  360. return -EBADF;
  361. if (!file->f_op->read && !file->f_op->aio_read)
  362. return -EINVAL;
  363. if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))
  364. return -EFAULT;
  365. ret = rw_verify_area(READ, file, pos, count);
  366. if (ret >= 0) {
  367. count = ret;
  368. if (file->f_op->read)
  369. ret = file->f_op->read(file, buf, count, pos);
  370. else
  371. ret = do_sync_read(file, buf, count, pos);
  372. if (ret > 0) {
  373. fsnotify_access(file);
  374. add_rchar(current, ret);
  375. }
  376. inc_syscr(current);
  377. }
  378. return ret;
  379. }
  380. EXPORT_SYMBOL(vfs_read);
  381. ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
  382. {
  383. struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
  384. struct kiocb kiocb;
  385. ssize_t ret;
  386. init_sync_kiocb(&kiocb, filp);
  387. kiocb.ki_pos = *ppos;
  388. kiocb.ki_nbytes = len;
  389. ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
  390. if (-EIOCBQUEUED == ret)
  391. ret = wait_on_sync_kiocb(&kiocb);
  392. *ppos = kiocb.ki_pos;
  393. return ret;
  394. }
  395. EXPORT_SYMBOL(do_sync_write);
  396. ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
  397. {
  398. mm_segment_t old_fs;
  399. const char __user *p;
  400. ssize_t ret;
  401. if (!file->f_op->write && !file->f_op->aio_write)
  402. return -EINVAL;
  403. old_fs = get_fs();
  404. set_fs(get_ds());
  405. p = (__force const char __user *)buf;
  406. if (count > MAX_RW_COUNT)
  407. count = MAX_RW_COUNT;
  408. if (file->f_op->write)
  409. ret = file->f_op->write(file, p, count, pos);
  410. else
  411. ret = do_sync_write(file, p, count, pos);
  412. set_fs(old_fs);
  413. if (ret > 0) {
  414. fsnotify_modify(file);
  415. add_wchar(current, ret);
  416. }
  417. inc_syscw(current);
  418. return ret;
  419. }
  420. ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
  421. {
  422. ssize_t ret;
  423. if (!(file->f_mode & FMODE_WRITE))
  424. return -EBADF;
  425. if (!file->f_op->write && !file->f_op->aio_write)
  426. return -EINVAL;
  427. if (unlikely(!access_ok(VERIFY_READ, buf, count)))
  428. return -EFAULT;
  429. ret = rw_verify_area(WRITE, file, pos, count);
  430. if (ret >= 0) {
  431. count = ret;
  432. file_start_write(file);
  433. if (file->f_op->write)
  434. ret = file->f_op->write(file, buf, count, pos);
  435. else
  436. ret = do_sync_write(file, buf, count, pos);
  437. if (ret > 0) {
  438. fsnotify_modify(file);
  439. add_wchar(current, ret);
  440. }
  441. inc_syscw(current);
  442. file_end_write(file);
  443. }
  444. return ret;
  445. }
  446. EXPORT_SYMBOL(vfs_write);
  447. static inline loff_t file_pos_read(struct file *file)
  448. {
  449. return file->f_pos;
  450. }
  451. static inline void file_pos_write(struct file *file, loff_t pos)
  452. {
  453. file->f_pos = pos;
  454. }
  455. SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
  456. {
  457. struct fd f = fdget_pos(fd);
  458. ssize_t ret = -EBADF;
  459. if (f.file) {
  460. loff_t pos = file_pos_read(f.file);
  461. ret = vfs_read(f.file, buf, count, &pos);
  462. if (ret >= 0)
  463. file_pos_write(f.file, pos);
  464. fdput_pos(f);
  465. }
  466. return ret;
  467. }
  468. SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
  469. size_t, count)
  470. {
  471. struct fd f = fdget_pos(fd);
  472. ssize_t ret = -EBADF;
  473. if (f.file) {
  474. loff_t pos = file_pos_read(f.file);
  475. ret = vfs_write(f.file, buf, count, &pos);
  476. if (ret >= 0)
  477. file_pos_write(f.file, pos);
  478. fdput_pos(f);
  479. }
  480. return ret;
  481. }
  482. SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
  483. size_t, count, loff_t, pos)
  484. {
  485. struct fd f;
  486. ssize_t ret = -EBADF;
  487. if (pos < 0)
  488. return -EINVAL;
  489. f = fdget(fd);
  490. if (f.file) {
  491. ret = -ESPIPE;
  492. if (f.file->f_mode & FMODE_PREAD)
  493. ret = vfs_read(f.file, buf, count, &pos);
  494. fdput(f);
  495. }
  496. return ret;
  497. }
  498. SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
  499. size_t, count, loff_t, pos)
  500. {
  501. struct fd f;
  502. ssize_t ret = -EBADF;
  503. if (pos < 0)
  504. return -EINVAL;
  505. f = fdget(fd);
  506. if (f.file) {
  507. ret = -ESPIPE;
  508. if (f.file->f_mode & FMODE_PWRITE)
  509. ret = vfs_write(f.file, buf, count, &pos);
  510. fdput(f);
  511. }
  512. return ret;
  513. }
  514. /*
  515. * Reduce an iovec's length in-place. Return the resulting number of segments
  516. */
  517. unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to)
  518. {
  519. unsigned long seg = 0;
  520. size_t len = 0;
  521. while (seg < nr_segs) {
  522. seg++;
  523. if (len + iov->iov_len >= to) {
  524. iov->iov_len = to - len;
  525. break;
  526. }
  527. len += iov->iov_len;
  528. iov++;
  529. }
  530. return seg;
  531. }
  532. EXPORT_SYMBOL(iov_shorten);
  533. static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
  534. unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn)
  535. {
  536. struct kiocb kiocb;
  537. ssize_t ret;
  538. init_sync_kiocb(&kiocb, filp);
  539. kiocb.ki_pos = *ppos;
  540. kiocb.ki_nbytes = len;
  541. ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
  542. if (ret == -EIOCBQUEUED)
  543. ret = wait_on_sync_kiocb(&kiocb);
  544. *ppos = kiocb.ki_pos;
  545. return ret;
  546. }
  547. /* Do it by hand, with file-ops */
  548. static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
  549. unsigned long nr_segs, loff_t *ppos, io_fn_t fn)
  550. {
  551. struct iovec *vector = iov;
  552. ssize_t ret = 0;
  553. while (nr_segs > 0) {
  554. void __user *base;
  555. size_t len;
  556. ssize_t nr;
  557. base = vector->iov_base;
  558. len = vector->iov_len;
  559. vector++;
  560. nr_segs--;
  561. nr = fn(filp, base, len, ppos);
  562. if (nr < 0) {
  563. if (!ret)
  564. ret = nr;
  565. break;
  566. }
  567. ret += nr;
  568. if (nr != len)
  569. break;
  570. }
  571. return ret;
  572. }
  573. /* A write operation does a read from user space and vice versa */
  574. #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
  575. ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
  576. unsigned long nr_segs, unsigned long fast_segs,
  577. struct iovec *fast_pointer,
  578. struct iovec **ret_pointer)
  579. {
  580. unsigned long seg;
  581. ssize_t ret;
  582. struct iovec *iov = fast_pointer;
  583. /*
  584. * SuS says "The readv() function *may* fail if the iovcnt argument
  585. * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
  586. * traditionally returned zero for zero segments, so...
  587. */
  588. if (nr_segs == 0) {
  589. ret = 0;
  590. goto out;
  591. }
  592. /*
  593. * First get the "struct iovec" from user memory and
  594. * verify all the pointers
  595. */
  596. if (nr_segs > UIO_MAXIOV) {
  597. ret = -EINVAL;
  598. goto out;
  599. }
  600. if (nr_segs > fast_segs) {
  601. iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
  602. if (iov == NULL) {
  603. ret = -ENOMEM;
  604. goto out;
  605. }
  606. }
  607. if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
  608. ret = -EFAULT;
  609. goto out;
  610. }
  611. /*
  612. * According to the Single Unix Specification we should return EINVAL
  613. * if an element length is < 0 when cast to ssize_t or if the
  614. * total length would overflow the ssize_t return value of the
  615. * system call.
  616. *
  617. * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the
  618. * overflow case.
  619. */
  620. ret = 0;
  621. for (seg = 0; seg < nr_segs; seg++) {
  622. void __user *buf = iov[seg].iov_base;
  623. ssize_t len = (ssize_t)iov[seg].iov_len;
  624. /* see if we we're about to use an invalid len or if
  625. * it's about to overflow ssize_t */
  626. if (len < 0) {
  627. ret = -EINVAL;
  628. goto out;
  629. }
  630. if (type >= 0
  631. && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
  632. ret = -EFAULT;
  633. goto out;
  634. }
  635. if (len > MAX_RW_COUNT - ret) {
  636. len = MAX_RW_COUNT - ret;
  637. iov[seg].iov_len = len;
  638. }
  639. ret += len;
  640. }
  641. out:
  642. *ret_pointer = iov;
  643. return ret;
  644. }
  645. static ssize_t do_readv_writev(int type, struct file *file,
  646. const struct iovec __user * uvector,
  647. unsigned long nr_segs, loff_t *pos)
  648. {
  649. size_t tot_len;
  650. struct iovec iovstack[UIO_FASTIOV];
  651. struct iovec *iov = iovstack;
  652. ssize_t ret;
  653. io_fn_t fn;
  654. iov_fn_t fnv;
  655. ret = rw_copy_check_uvector(type, uvector, nr_segs,
  656. ARRAY_SIZE(iovstack), iovstack, &iov);
  657. if (ret <= 0)
  658. goto out;
  659. tot_len = ret;
  660. ret = rw_verify_area(type, file, pos, tot_len);
  661. if (ret < 0)
  662. goto out;
  663. fnv = NULL;
  664. if (type == READ) {
  665. fn = file->f_op->read;
  666. fnv = file->f_op->aio_read;
  667. } else {
  668. fn = (io_fn_t)file->f_op->write;
  669. fnv = file->f_op->aio_write;
  670. file_start_write(file);
  671. }
  672. if (fnv)
  673. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  674. pos, fnv);
  675. else
  676. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  677. if (type != READ)
  678. file_end_write(file);
  679. out:
  680. if (iov != iovstack)
  681. kfree(iov);
  682. if ((ret + (type == READ)) > 0) {
  683. if (type == READ)
  684. fsnotify_access(file);
  685. else
  686. fsnotify_modify(file);
  687. }
  688. return ret;
  689. }
  690. ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
  691. unsigned long vlen, loff_t *pos)
  692. {
  693. if (!(file->f_mode & FMODE_READ))
  694. return -EBADF;
  695. if (!file->f_op->aio_read && !file->f_op->read)
  696. return -EINVAL;
  697. return do_readv_writev(READ, file, vec, vlen, pos);
  698. }
  699. EXPORT_SYMBOL(vfs_readv);
  700. ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
  701. unsigned long vlen, loff_t *pos)
  702. {
  703. if (!(file->f_mode & FMODE_WRITE))
  704. return -EBADF;
  705. if (!file->f_op->aio_write && !file->f_op->write)
  706. return -EINVAL;
  707. return do_readv_writev(WRITE, file, vec, vlen, pos);
  708. }
  709. EXPORT_SYMBOL(vfs_writev);
  710. SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
  711. unsigned long, vlen)
  712. {
  713. struct fd f = fdget_pos(fd);
  714. ssize_t ret = -EBADF;
  715. if (f.file) {
  716. loff_t pos = file_pos_read(f.file);
  717. ret = vfs_readv(f.file, vec, vlen, &pos);
  718. if (ret >= 0)
  719. file_pos_write(f.file, pos);
  720. fdput_pos(f);
  721. }
  722. if (ret > 0)
  723. add_rchar(current, ret);
  724. inc_syscr(current);
  725. return ret;
  726. }
  727. SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
  728. unsigned long, vlen)
  729. {
  730. struct fd f = fdget_pos(fd);
  731. ssize_t ret = -EBADF;
  732. if (f.file) {
  733. loff_t pos = file_pos_read(f.file);
  734. ret = vfs_writev(f.file, vec, vlen, &pos);
  735. if (ret >= 0)
  736. file_pos_write(f.file, pos);
  737. fdput_pos(f);
  738. }
  739. if (ret > 0)
  740. add_wchar(current, ret);
  741. inc_syscw(current);
  742. return ret;
  743. }
  744. static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
  745. {
  746. #define HALF_LONG_BITS (BITS_PER_LONG / 2)
  747. return (((loff_t)high << HALF_LONG_BITS) << HALF_LONG_BITS) | low;
  748. }
  749. SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
  750. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  751. {
  752. loff_t pos = pos_from_hilo(pos_h, pos_l);
  753. struct fd f;
  754. ssize_t ret = -EBADF;
  755. if (pos < 0)
  756. return -EINVAL;
  757. f = fdget(fd);
  758. if (f.file) {
  759. ret = -ESPIPE;
  760. if (f.file->f_mode & FMODE_PREAD)
  761. ret = vfs_readv(f.file, vec, vlen, &pos);
  762. fdput(f);
  763. }
  764. if (ret > 0)
  765. add_rchar(current, ret);
  766. inc_syscr(current);
  767. return ret;
  768. }
  769. SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
  770. unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
  771. {
  772. loff_t pos = pos_from_hilo(pos_h, pos_l);
  773. struct fd f;
  774. ssize_t ret = -EBADF;
  775. if (pos < 0)
  776. return -EINVAL;
  777. f = fdget(fd);
  778. if (f.file) {
  779. ret = -ESPIPE;
  780. if (f.file->f_mode & FMODE_PWRITE)
  781. ret = vfs_writev(f.file, vec, vlen, &pos);
  782. fdput(f);
  783. }
  784. if (ret > 0)
  785. add_wchar(current, ret);
  786. inc_syscw(current);
  787. return ret;
  788. }
  789. #ifdef CONFIG_COMPAT
  790. static ssize_t compat_do_readv_writev(int type, struct file *file,
  791. const struct compat_iovec __user *uvector,
  792. unsigned long nr_segs, loff_t *pos)
  793. {
  794. compat_ssize_t tot_len;
  795. struct iovec iovstack[UIO_FASTIOV];
  796. struct iovec *iov = iovstack;
  797. ssize_t ret;
  798. io_fn_t fn;
  799. iov_fn_t fnv;
  800. ret = compat_rw_copy_check_uvector(type, uvector, nr_segs,
  801. UIO_FASTIOV, iovstack, &iov);
  802. if (ret <= 0)
  803. goto out;
  804. tot_len = ret;
  805. ret = rw_verify_area(type, file, pos, tot_len);
  806. if (ret < 0)
  807. goto out;
  808. fnv = NULL;
  809. if (type == READ) {
  810. fn = file->f_op->read;
  811. fnv = file->f_op->aio_read;
  812. } else {
  813. fn = (io_fn_t)file->f_op->write;
  814. fnv = file->f_op->aio_write;
  815. file_start_write(file);
  816. }
  817. if (fnv)
  818. ret = do_sync_readv_writev(file, iov, nr_segs, tot_len,
  819. pos, fnv);
  820. else
  821. ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn);
  822. if (type != READ)
  823. file_end_write(file);
  824. out:
  825. if (iov != iovstack)
  826. kfree(iov);
  827. if ((ret + (type == READ)) > 0) {
  828. if (type == READ)
  829. fsnotify_access(file);
  830. else
  831. fsnotify_modify(file);
  832. }
  833. return ret;
  834. }
  835. static size_t compat_readv(struct file *file,
  836. const struct compat_iovec __user *vec,
  837. unsigned long vlen, loff_t *pos)
  838. {
  839. ssize_t ret = -EBADF;
  840. if (!(file->f_mode & FMODE_READ))
  841. goto out;
  842. ret = -EINVAL;
  843. if (!file->f_op->aio_read && !file->f_op->read)
  844. goto out;
  845. ret = compat_do_readv_writev(READ, file, vec, vlen, pos);
  846. out:
  847. if (ret > 0)
  848. add_rchar(current, ret);
  849. inc_syscr(current);
  850. return ret;
  851. }
  852. COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd,
  853. const struct compat_iovec __user *,vec,
  854. compat_ulong_t, vlen)
  855. {
  856. struct fd f = fdget_pos(fd);
  857. ssize_t ret;
  858. loff_t pos;
  859. if (!f.file)
  860. return -EBADF;
  861. pos = f.file->f_pos;
  862. ret = compat_readv(f.file, vec, vlen, &pos);
  863. if (ret >= 0)
  864. f.file->f_pos = pos;
  865. fdput_pos(f);
  866. return ret;
  867. }
  868. static long __compat_sys_preadv64(unsigned long fd,
  869. const struct compat_iovec __user *vec,
  870. unsigned long vlen, loff_t pos)
  871. {
  872. struct fd f;
  873. ssize_t ret;
  874. if (pos < 0)
  875. return -EINVAL;
  876. f = fdget(fd);
  877. if (!f.file)
  878. return -EBADF;
  879. ret = -ESPIPE;
  880. if (f.file->f_mode & FMODE_PREAD)
  881. ret = compat_readv(f.file, vec, vlen, &pos);
  882. fdput(f);
  883. return ret;
  884. }
  #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64
  /* Compat preadv64: the offset arrives as one loff_t; forward it as is. */
  COMPAT_SYSCALL_DEFINE4(preadv64, unsigned long, fd,
  		const struct compat_iovec __user *,vec,
  		unsigned long, vlen, loff_t, pos)
  {
  	return __compat_sys_preadv64(fd, vec, vlen, pos);
  }
  #endif
  893. COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd,
  894. const struct compat_iovec __user *,vec,
  895. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  896. {
  897. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  898. return __compat_sys_preadv64(fd, vec, vlen, pos);
  899. }
  900. static size_t compat_writev(struct file *file,
  901. const struct compat_iovec __user *vec,
  902. unsigned long vlen, loff_t *pos)
  903. {
  904. ssize_t ret = -EBADF;
  905. if (!(file->f_mode & FMODE_WRITE))
  906. goto out;
  907. ret = -EINVAL;
  908. if (!file->f_op->aio_write && !file->f_op->write)
  909. goto out;
  910. ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos);
  911. out:
  912. if (ret > 0)
  913. add_wchar(current, ret);
  914. inc_syscw(current);
  915. return ret;
  916. }
  917. COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd,
  918. const struct compat_iovec __user *, vec,
  919. compat_ulong_t, vlen)
  920. {
  921. struct fd f = fdget_pos(fd);
  922. ssize_t ret;
  923. loff_t pos;
  924. if (!f.file)
  925. return -EBADF;
  926. pos = f.file->f_pos;
  927. ret = compat_writev(f.file, vec, vlen, &pos);
  928. if (ret >= 0)
  929. f.file->f_pos = pos;
  930. fdput_pos(f);
  931. return ret;
  932. }
  933. static long __compat_sys_pwritev64(unsigned long fd,
  934. const struct compat_iovec __user *vec,
  935. unsigned long vlen, loff_t pos)
  936. {
  937. struct fd f;
  938. ssize_t ret;
  939. if (pos < 0)
  940. return -EINVAL;
  941. f = fdget(fd);
  942. if (!f.file)
  943. return -EBADF;
  944. ret = -ESPIPE;
  945. if (f.file->f_mode & FMODE_PWRITE)
  946. ret = compat_writev(f.file, vec, vlen, &pos);
  947. fdput(f);
  948. return ret;
  949. }
  950. #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64
  951. COMPAT_SYSCALL_DEFINE4(pwritev64, unsigned long, fd,
  952. const struct compat_iovec __user *,vec,
  953. unsigned long, vlen, loff_t, pos)
  954. {
  955. return __compat_sys_pwritev64(fd, vec, vlen, pos);
  956. }
  957. #endif
  958. COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd,
  959. const struct compat_iovec __user *,vec,
  960. compat_ulong_t, vlen, u32, pos_low, u32, pos_high)
  961. {
  962. loff_t pos = ((loff_t)pos_high << 32) | pos_low;
  963. return __compat_sys_pwritev64(fd, vec, vlen, pos);
  964. }
  965. #endif
  966. static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
  967. size_t count, loff_t max)
  968. {
  969. struct fd in, out;
  970. struct inode *in_inode, *out_inode;
  971. loff_t pos;
  972. loff_t out_pos;
  973. ssize_t retval;
  974. int fl;
  975. /*
  976. * Get input file, and verify that it is ok..
  977. */
  978. retval = -EBADF;
  979. in = fdget(in_fd);
  980. if (!in.file)
  981. goto out;
  982. if (!(in.file->f_mode & FMODE_READ))
  983. goto fput_in;
  984. retval = -ESPIPE;
  985. if (!ppos) {
  986. pos = in.file->f_pos;
  987. } else {
  988. pos = *ppos;
  989. if (!(in.file->f_mode & FMODE_PREAD))
  990. goto fput_in;
  991. }
  992. retval = rw_verify_area(READ, in.file, &pos, count);
  993. if (retval < 0)
  994. goto fput_in;
  995. count = retval;
  996. /*
  997. * Get output file, and verify that it is ok..
  998. */
  999. retval = -EBADF;
  1000. out = fdget(out_fd);
  1001. if (!out.file)
  1002. goto fput_in;
  1003. if (!(out.file->f_mode & FMODE_WRITE))
  1004. goto fput_out;
  1005. retval = -EINVAL;
  1006. in_inode = file_inode(in.file);
  1007. out_inode = file_inode(out.file);
  1008. out_pos = out.file->f_pos;
  1009. retval = rw_verify_area(WRITE, out.file, &out_pos, count);
  1010. if (retval < 0)
  1011. goto fput_out;
  1012. count = retval;
  1013. if (!max)
  1014. max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
  1015. if (unlikely(pos + count > max)) {
  1016. retval = -EOVERFLOW;
  1017. if (pos >= max)
  1018. goto fput_out;
  1019. count = max - pos;
  1020. }
  1021. fl = 0;
  1022. #if 0
  1023. /*
  1024. * We need to debate whether we can enable this or not. The
  1025. * man page documents EAGAIN return for the output at least,
  1026. * and the application is arguably buggy if it doesn't expect
  1027. * EAGAIN on a non-blocking file descriptor.
  1028. */
  1029. if (in.file->f_flags & O_NONBLOCK)
  1030. fl = SPLICE_F_NONBLOCK;
  1031. #endif
  1032. file_start_write(out.file);
  1033. retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl);
  1034. file_end_write(out.file);
  1035. if (retval > 0) {
  1036. add_rchar(current, retval);
  1037. add_wchar(current, retval);
  1038. fsnotify_access(in.file);
  1039. fsnotify_modify(out.file);
  1040. out.file->f_pos = out_pos;
  1041. if (ppos)
  1042. *ppos = pos;
  1043. else
  1044. in.file->f_pos = pos;
  1045. }
  1046. inc_syscr(current);
  1047. inc_syscw(current);
  1048. if (pos > max)
  1049. retval = -EOVERFLOW;
  1050. fput_out:
  1051. fdput(out);
  1052. fput_in:
  1053. fdput(in);
  1054. out:
  1055. return retval;
  1056. }
  1057. SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, off_t __user *, offset, size_t, count)
  1058. {
  1059. loff_t pos;
  1060. off_t off;
  1061. ssize_t ret;
  1062. if (offset) {
  1063. if (unlikely(get_user(off, offset)))
  1064. return -EFAULT;
  1065. pos = off;
  1066. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1067. if (unlikely(put_user(pos, offset)))
  1068. return -EFAULT;
  1069. return ret;
  1070. }
  1071. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1072. }
  1073. SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, size_t, count)
  1074. {
  1075. loff_t pos;
  1076. ssize_t ret;
  1077. if (offset) {
  1078. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1079. return -EFAULT;
  1080. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1081. if (unlikely(put_user(pos, offset)))
  1082. return -EFAULT;
  1083. return ret;
  1084. }
  1085. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1086. }
  1087. #ifdef CONFIG_COMPAT
  1088. COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd,
  1089. compat_off_t __user *, offset, compat_size_t, count)
  1090. {
  1091. loff_t pos;
  1092. off_t off;
  1093. ssize_t ret;
  1094. if (offset) {
  1095. if (unlikely(get_user(off, offset)))
  1096. return -EFAULT;
  1097. pos = off;
  1098. ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
  1099. if (unlikely(put_user(pos, offset)))
  1100. return -EFAULT;
  1101. return ret;
  1102. }
  1103. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1104. }
  1105. COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
  1106. compat_loff_t __user *, offset, compat_size_t, count)
  1107. {
  1108. loff_t pos;
  1109. ssize_t ret;
  1110. if (offset) {
  1111. if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
  1112. return -EFAULT;
  1113. ret = do_sendfile(out_fd, in_fd, &pos, count, 0);
  1114. if (unlikely(put_user(pos, offset)))
  1115. return -EFAULT;
  1116. return ret;
  1117. }
  1118. return do_sendfile(out_fd, in_fd, NULL, count, 0);
  1119. }
  1120. #endif