file.c 26 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019
  1. /*
  2. * (C) 2001 Clemson University and The University of Chicago
  3. *
  4. * See COPYING in top-level directory.
  5. */
  6. /*
  7. * Linux VFS file operations.
  8. */
  9. #include "protocol.h"
  10. #include "pvfs2-kernel.h"
  11. #include "pvfs2-bufmap.h"
  12. #include <linux/fs.h>
  13. #include <linux/pagemap.h>
  14. #define wake_up_daemon_for_return(op) \
  15. do { \
  16. spin_lock(&op->lock); \
  17. op->io_completed = 1; \
  18. spin_unlock(&op->lock); \
  19. wake_up_interruptible(&op->io_completion_waitq);\
  20. } while (0)
  21. /*
  22. * Copy to client-core's address space from the buffers specified
  23. * by the iovec upto total_size bytes.
  24. * NOTE: the iovector can either contain addresses which
  25. * can futher be kernel-space or user-space addresses.
  26. * or it can pointers to struct page's
  27. */
  28. static int precopy_buffers(struct pvfs2_bufmap *bufmap,
  29. int buffer_index,
  30. const struct iovec *vec,
  31. unsigned long nr_segs,
  32. size_t total_size,
  33. int from_user)
  34. {
  35. int ret = 0;
  36. /*
  37. * copy data from application/kernel by pulling it out
  38. * of the iovec.
  39. */
  40. /* Are we copying from User Virtual Addresses? */
  41. if (from_user)
  42. ret = pvfs_bufmap_copy_iovec_from_user(
  43. bufmap,
  44. buffer_index,
  45. vec,
  46. nr_segs,
  47. total_size);
  48. /* Are we copying from Kernel Virtual Addresses? */
  49. else
  50. ret = pvfs_bufmap_copy_iovec_from_kernel(
  51. bufmap,
  52. buffer_index,
  53. vec,
  54. nr_segs,
  55. total_size);
  56. if (ret < 0)
  57. gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n",
  58. __func__,
  59. (long)ret);
  60. return ret;
  61. }
  62. /*
  63. * Copy from client-core's address space to the buffers specified
  64. * by the iovec upto total_size bytes.
  65. * NOTE: the iovector can either contain addresses which
  66. * can futher be kernel-space or user-space addresses.
  67. * or it can pointers to struct page's
  68. */
  69. static int postcopy_buffers(struct pvfs2_bufmap *bufmap,
  70. int buffer_index,
  71. const struct iovec *vec,
  72. int nr_segs,
  73. size_t total_size,
  74. int to_user)
  75. {
  76. int ret = 0;
  77. /*
  78. * copy data to application/kernel by pushing it out to
  79. * the iovec. NOTE; target buffers can be addresses or
  80. * struct page pointers.
  81. */
  82. if (total_size) {
  83. /* Are we copying to User Virtual Addresses? */
  84. if (to_user)
  85. ret = pvfs_bufmap_copy_to_user_iovec(
  86. bufmap,
  87. buffer_index,
  88. vec,
  89. nr_segs,
  90. total_size);
  91. /* Are we copying to Kern Virtual Addresses? */
  92. else
  93. ret = pvfs_bufmap_copy_to_kernel_iovec(
  94. bufmap,
  95. buffer_index,
  96. vec,
  97. nr_segs,
  98. total_size);
  99. if (ret < 0)
  100. gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n",
  101. __func__,
  102. (long)ret);
  103. }
  104. return ret;
  105. }
  106. /*
  107. * Post and wait for the I/O upcall to finish
  108. */
  109. static ssize_t wait_for_direct_io(enum PVFS_io_type type, struct inode *inode,
  110. loff_t *offset, struct iovec *vec, unsigned long nr_segs,
  111. size_t total_size, loff_t readahead_size, int to_user)
  112. {
  113. struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
  114. struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
  115. struct pvfs2_bufmap *bufmap = NULL;
  116. struct pvfs2_kernel_op_s *new_op = NULL;
  117. int buffer_index = -1;
  118. ssize_t ret;
  119. new_op = op_alloc(PVFS2_VFS_OP_FILE_IO);
  120. if (!new_op) {
  121. ret = -ENOMEM;
  122. goto out;
  123. }
  124. /* synchronous I/O */
  125. new_op->upcall.req.io.async_vfs_io = PVFS_VFS_SYNC_IO;
  126. new_op->upcall.req.io.readahead_size = readahead_size;
  127. new_op->upcall.req.io.io_type = type;
  128. new_op->upcall.req.io.refn = pvfs2_inode->refn;
  129. populate_shared_memory:
  130. /* get a shared buffer index */
  131. ret = pvfs_bufmap_get(&bufmap, &buffer_index);
  132. if (ret < 0) {
  133. gossip_debug(GOSSIP_FILE_DEBUG,
  134. "%s: pvfs_bufmap_get failure (%ld)\n",
  135. __func__, (long)ret);
  136. goto out;
  137. }
  138. gossip_debug(GOSSIP_FILE_DEBUG,
  139. "%s(%pU): GET op %p -> buffer_index %d\n",
  140. __func__,
  141. handle,
  142. new_op,
  143. buffer_index);
  144. new_op->uses_shared_memory = 1;
  145. new_op->upcall.req.io.buf_index = buffer_index;
  146. new_op->upcall.req.io.count = total_size;
  147. new_op->upcall.req.io.offset = *offset;
  148. gossip_debug(GOSSIP_FILE_DEBUG,
  149. "%s(%pU): copy_to_user %d nr_segs %lu, offset: %llu total_size: %zd\n",
  150. __func__,
  151. handle,
  152. to_user,
  153. nr_segs,
  154. llu(*offset),
  155. total_size);
  156. /*
  157. * Stage 1: copy the buffers into client-core's address space
  158. * precopy_buffers only pertains to writes.
  159. */
  160. if (type == PVFS_IO_WRITE) {
  161. ret = precopy_buffers(bufmap,
  162. buffer_index,
  163. vec,
  164. nr_segs,
  165. total_size,
  166. to_user);
  167. if (ret < 0)
  168. goto out;
  169. }
  170. gossip_debug(GOSSIP_FILE_DEBUG,
  171. "%s(%pU): Calling post_io_request with tag (%llu)\n",
  172. __func__,
  173. handle,
  174. llu(new_op->tag));
  175. /* Stage 2: Service the I/O operation */
  176. ret = service_operation(new_op,
  177. type == PVFS_IO_WRITE ?
  178. "file_write" :
  179. "file_read",
  180. get_interruptible_flag(inode));
  181. /*
  182. * If service_operation() returns -EAGAIN #and# the operation was
  183. * purged from pvfs2_request_list or htable_ops_in_progress, then
  184. * we know that the client was restarted, causing the shared memory
  185. * area to be wiped clean. To restart a write operation in this
  186. * case, we must re-copy the data from the user's iovec to a NEW
  187. * shared memory location. To restart a read operation, we must get
  188. * a new shared memory location.
  189. */
  190. if (ret == -EAGAIN && op_state_purged(new_op)) {
  191. pvfs_bufmap_put(bufmap, buffer_index);
  192. gossip_debug(GOSSIP_FILE_DEBUG,
  193. "%s:going to repopulate_shared_memory.\n",
  194. __func__);
  195. goto populate_shared_memory;
  196. }
  197. if (ret < 0) {
  198. handle_io_error(); /* defined in pvfs2-kernel.h */
  199. /*
  200. don't write an error to syslog on signaled operation
  201. termination unless we've got debugging turned on, as
  202. this can happen regularly (i.e. ctrl-c)
  203. */
  204. if (ret == -EINTR)
  205. gossip_debug(GOSSIP_FILE_DEBUG,
  206. "%s: returning error %ld\n", __func__,
  207. (long)ret);
  208. else
  209. gossip_err("%s: error in %s handle %pU, returning %zd\n",
  210. __func__,
  211. type == PVFS_IO_READ ?
  212. "read from" : "write to",
  213. handle, ret);
  214. goto out;
  215. }
  216. /*
  217. * Stage 3: Post copy buffers from client-core's address space
  218. * postcopy_buffers only pertains to reads.
  219. */
  220. if (type == PVFS_IO_READ) {
  221. ret = postcopy_buffers(bufmap,
  222. buffer_index,
  223. vec,
  224. nr_segs,
  225. new_op->downcall.resp.io.amt_complete,
  226. to_user);
  227. if (ret < 0) {
  228. /*
  229. * put error codes in downcall so that handle_io_error()
  230. * preserves it properly
  231. */
  232. new_op->downcall.status = ret;
  233. handle_io_error();
  234. goto out;
  235. }
  236. }
  237. gossip_debug(GOSSIP_FILE_DEBUG,
  238. "%s(%pU): Amount written as returned by the sys-io call:%d\n",
  239. __func__,
  240. handle,
  241. (int)new_op->downcall.resp.io.amt_complete);
  242. ret = new_op->downcall.resp.io.amt_complete;
  243. /*
  244. tell the device file owner waiting on I/O that this read has
  245. completed and it can return now. in this exact case, on
  246. wakeup the daemon will free the op, so we *cannot* touch it
  247. after this.
  248. */
  249. wake_up_daemon_for_return(new_op);
  250. new_op = NULL;
  251. out:
  252. if (buffer_index >= 0) {
  253. pvfs_bufmap_put(bufmap, buffer_index);
  254. gossip_debug(GOSSIP_FILE_DEBUG,
  255. "%s(%pU): PUT buffer_index %d\n",
  256. __func__, handle, buffer_index);
  257. buffer_index = -1;
  258. }
  259. if (new_op) {
  260. op_release(new_op);
  261. new_op = NULL;
  262. }
  263. return ret;
  264. }
  265. /*
  266. * The reason we need to do this is to be able to support readv and writev
  267. * that are larger than (pvfs_bufmap_size_query()) Default is
  268. * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
  269. * create a new io vec descriptor for those memory addresses that
  270. * go beyond the limit. Return value for this routine is negative in case
  271. * of errors and 0 in case of success.
  272. *
  273. * Further, the new_nr_segs pointer is updated to hold the new value
  274. * of number of iovecs, the new_vec pointer is updated to hold the pointer
  275. * to the new split iovec, and the size array is an array of integers holding
  276. * the number of iovecs that straddle pvfs_bufmap_size_query().
  277. * The max_new_nr_segs value is computed by the caller and returned.
  278. * (It will be (count of all iov_len/ block_size) + 1).
  279. */
  280. static int split_iovecs(unsigned long max_new_nr_segs, /* IN */
  281. unsigned long nr_segs, /* IN */
  282. const struct iovec *original_iovec, /* IN */
  283. unsigned long *new_nr_segs, /* OUT */
  284. struct iovec **new_vec, /* OUT */
  285. unsigned long *seg_count, /* OUT */
  286. unsigned long **seg_array) /* OUT */
  287. {
  288. unsigned long seg;
  289. unsigned long count = 0;
  290. unsigned long begin_seg;
  291. unsigned long tmpnew_nr_segs = 0;
  292. struct iovec *new_iovec = NULL;
  293. struct iovec *orig_iovec;
  294. unsigned long *sizes = NULL;
  295. unsigned long sizes_count = 0;
  296. if (nr_segs <= 0 ||
  297. original_iovec == NULL ||
  298. new_nr_segs == NULL ||
  299. new_vec == NULL ||
  300. seg_count == NULL ||
  301. seg_array == NULL ||
  302. max_new_nr_segs <= 0) {
  303. gossip_err("Invalid parameters to split_iovecs\n");
  304. return -EINVAL;
  305. }
  306. *new_nr_segs = 0;
  307. *new_vec = NULL;
  308. *seg_count = 0;
  309. *seg_array = NULL;
  310. /* copy the passed in iovec descriptor to a temp structure */
  311. orig_iovec = kmalloc_array(nr_segs,
  312. sizeof(*orig_iovec),
  313. PVFS2_BUFMAP_GFP_FLAGS);
  314. if (orig_iovec == NULL) {
  315. gossip_err(
  316. "split_iovecs: Could not allocate memory for %lu bytes!\n",
  317. (unsigned long)(nr_segs * sizeof(*orig_iovec)));
  318. return -ENOMEM;
  319. }
  320. new_iovec = kcalloc(max_new_nr_segs,
  321. sizeof(*new_iovec),
  322. PVFS2_BUFMAP_GFP_FLAGS);
  323. if (new_iovec == NULL) {
  324. kfree(orig_iovec);
  325. gossip_err(
  326. "split_iovecs: Could not allocate memory for %lu bytes!\n",
  327. (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
  328. return -ENOMEM;
  329. }
  330. sizes = kcalloc(max_new_nr_segs,
  331. sizeof(*sizes),
  332. PVFS2_BUFMAP_GFP_FLAGS);
  333. if (sizes == NULL) {
  334. kfree(new_iovec);
  335. kfree(orig_iovec);
  336. gossip_err(
  337. "split_iovecs: Could not allocate memory for %lu bytes!\n",
  338. (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
  339. return -ENOMEM;
  340. }
  341. /* copy the passed in iovec to a temp structure */
  342. memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
  343. begin_seg = 0;
  344. repeat:
  345. for (seg = begin_seg; seg < nr_segs; seg++) {
  346. if (tmpnew_nr_segs >= max_new_nr_segs ||
  347. sizes_count >= max_new_nr_segs) {
  348. kfree(sizes);
  349. kfree(orig_iovec);
  350. kfree(new_iovec);
  351. gossip_err
  352. ("split_iovecs: exceeded the index limit (%lu)\n",
  353. tmpnew_nr_segs);
  354. return -EINVAL;
  355. }
  356. if (count + orig_iovec[seg].iov_len <
  357. pvfs_bufmap_size_query()) {
  358. count += orig_iovec[seg].iov_len;
  359. memcpy(&new_iovec[tmpnew_nr_segs],
  360. &orig_iovec[seg],
  361. sizeof(*new_iovec));
  362. tmpnew_nr_segs++;
  363. sizes[sizes_count]++;
  364. } else {
  365. new_iovec[tmpnew_nr_segs].iov_base =
  366. orig_iovec[seg].iov_base;
  367. new_iovec[tmpnew_nr_segs].iov_len =
  368. (pvfs_bufmap_size_query() - count);
  369. tmpnew_nr_segs++;
  370. sizes[sizes_count]++;
  371. sizes_count++;
  372. begin_seg = seg;
  373. orig_iovec[seg].iov_base +=
  374. (pvfs_bufmap_size_query() - count);
  375. orig_iovec[seg].iov_len -=
  376. (pvfs_bufmap_size_query() - count);
  377. count = 0;
  378. break;
  379. }
  380. }
  381. if (seg != nr_segs)
  382. goto repeat;
  383. else
  384. sizes_count++;
  385. *new_nr_segs = tmpnew_nr_segs;
  386. /* new_iovec is freed by the caller */
  387. *new_vec = new_iovec;
  388. *seg_count = sizes_count;
  389. /* seg_array is also freed by the caller */
  390. *seg_array = sizes;
  391. kfree(orig_iovec);
  392. return 0;
  393. }
  394. static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
  395. ssize_t *total_count)
  396. {
  397. unsigned long i;
  398. long max_nr_iovecs;
  399. ssize_t total;
  400. ssize_t count;
  401. total = 0;
  402. count = 0;
  403. max_nr_iovecs = 0;
  404. for (i = 0; i < nr_segs; i++) {
  405. const struct iovec *iv = &curr[i];
  406. count += iv->iov_len;
  407. if (unlikely((ssize_t) (count | iv->iov_len) < 0))
  408. return -EINVAL;
  409. if (total + iv->iov_len < pvfs_bufmap_size_query()) {
  410. total += iv->iov_len;
  411. max_nr_iovecs++;
  412. } else {
  413. total =
  414. (total + iv->iov_len - pvfs_bufmap_size_query());
  415. max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
  416. }
  417. }
  418. *total_count = count;
  419. return max_nr_iovecs;
  420. }
  421. /*
  422. * Common entry point for read/write/readv/writev
  423. * This function will dispatch it to either the direct I/O
  424. * or buffered I/O path depending on the mount options and/or
  425. * augmented/extended metadata attached to the file.
  426. * Note: File extended attributes override any mount options.
  427. */
  428. static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
  429. loff_t *offset, const struct iovec *iov, unsigned long nr_segs)
  430. {
  431. struct inode *inode = file->f_mapping->host;
  432. struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
  433. struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
  434. ssize_t ret;
  435. ssize_t total_count;
  436. unsigned int to_free;
  437. size_t count;
  438. unsigned long seg;
  439. unsigned long new_nr_segs = 0;
  440. unsigned long max_new_nr_segs = 0;
  441. unsigned long seg_count = 0;
  442. unsigned long *seg_array = NULL;
  443. struct iovec *iovecptr = NULL;
  444. struct iovec *ptr = NULL;
  445. total_count = 0;
  446. ret = -EINVAL;
  447. count = 0;
  448. to_free = 0;
  449. /* Compute total and max number of segments after split */
  450. max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
  451. if (max_new_nr_segs < 0) {
  452. gossip_lerr("%s: could not bound iovec %lu\n",
  453. __func__,
  454. max_new_nr_segs);
  455. goto out;
  456. }
  457. gossip_debug(GOSSIP_FILE_DEBUG,
  458. "%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
  459. __func__,
  460. handle,
  461. (int)count);
  462. if (type == PVFS_IO_WRITE) {
  463. gossip_debug(GOSSIP_FILE_DEBUG,
  464. "%s(%pU): proceeding with offset : %llu, "
  465. "size %d\n",
  466. __func__,
  467. handle,
  468. llu(*offset),
  469. (int)count);
  470. }
  471. if (count == 0) {
  472. ret = 0;
  473. goto out;
  474. }
  475. /*
  476. * if the total size of data transfer requested is greater than
  477. * the kernel-set blocksize of PVFS2, then we split the iovecs
  478. * such that no iovec description straddles a block size limit
  479. */
  480. gossip_debug(GOSSIP_FILE_DEBUG,
  481. "%s: pvfs_bufmap_size:%d\n",
  482. __func__,
  483. pvfs_bufmap_size_query());
  484. if (count > pvfs_bufmap_size_query()) {
  485. /*
  486. * Split up the given iovec description such that
  487. * no iovec descriptor straddles over the block-size limitation.
  488. * This makes us our job easier to stage the I/O.
  489. * In addition, this function will also compute an array
  490. * with seg_count entries that will store the number of
  491. * segments that straddle the block-size boundaries.
  492. */
  493. ret = split_iovecs(max_new_nr_segs, /* IN */
  494. nr_segs, /* IN */
  495. iov, /* IN */
  496. &new_nr_segs, /* OUT */
  497. &iovecptr, /* OUT */
  498. &seg_count, /* OUT */
  499. &seg_array); /* OUT */
  500. if (ret < 0) {
  501. gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
  502. __func__,
  503. ret);
  504. goto out;
  505. }
  506. gossip_debug(GOSSIP_FILE_DEBUG,
  507. "%s: Splitting iovecs from %lu to %lu"
  508. " [max_new %lu]\n",
  509. __func__,
  510. nr_segs,
  511. new_nr_segs,
  512. max_new_nr_segs);
  513. /* We must free seg_array and iovecptr */
  514. to_free = 1;
  515. } else {
  516. new_nr_segs = nr_segs;
  517. /* use the given iovec description */
  518. iovecptr = (struct iovec *)iov;
  519. /* There is only 1 element in the seg_array */
  520. seg_count = 1;
  521. /* and its value is the number of segments passed in */
  522. seg_array = &nr_segs;
  523. /* We dont have to free up anything */
  524. to_free = 0;
  525. }
  526. ptr = iovecptr;
  527. gossip_debug(GOSSIP_FILE_DEBUG,
  528. "%s(%pU) %zd@%llu\n",
  529. __func__,
  530. handle,
  531. count,
  532. llu(*offset));
  533. gossip_debug(GOSSIP_FILE_DEBUG,
  534. "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
  535. __func__,
  536. handle,
  537. new_nr_segs, seg_count);
  538. /* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
  539. #ifdef PVFS2_KERNEL_DEBUG
  540. for (seg = 0; seg < new_nr_segs; seg++)
  541. gossip_debug(GOSSIP_FILE_DEBUG,
  542. "%s: %d) %p to %p [%d bytes]\n",
  543. __func__,
  544. (int)seg + 1,
  545. iovecptr[seg].iov_base,
  546. iovecptr[seg].iov_base + iovecptr[seg].iov_len,
  547. (int)iovecptr[seg].iov_len);
  548. for (seg = 0; seg < seg_count; seg++)
  549. gossip_debug(GOSSIP_FILE_DEBUG,
  550. "%s: %zd) %lu\n",
  551. __func__,
  552. seg + 1,
  553. seg_array[seg]);
  554. #endif
  555. seg = 0;
  556. while (total_count < count) {
  557. size_t each_count;
  558. size_t amt_complete;
  559. /* how much to transfer in this loop iteration */
  560. each_count =
  561. (((count - total_count) > pvfs_bufmap_size_query()) ?
  562. pvfs_bufmap_size_query() :
  563. (count - total_count));
  564. gossip_debug(GOSSIP_FILE_DEBUG,
  565. "%s(%pU): size of each_count(%d)\n",
  566. __func__,
  567. handle,
  568. (int)each_count);
  569. gossip_debug(GOSSIP_FILE_DEBUG,
  570. "%s(%pU): BEFORE wait_for_io: offset is %d\n",
  571. __func__,
  572. handle,
  573. (int)*offset);
  574. ret = wait_for_direct_io(type, inode, offset, ptr,
  575. seg_array[seg], each_count, 0, 1);
  576. gossip_debug(GOSSIP_FILE_DEBUG,
  577. "%s(%pU): return from wait_for_io:%d\n",
  578. __func__,
  579. handle,
  580. (int)ret);
  581. if (ret < 0)
  582. goto out;
  583. /* advance the iovec pointer */
  584. ptr += seg_array[seg];
  585. seg++;
  586. *offset += ret;
  587. total_count += ret;
  588. amt_complete = ret;
  589. gossip_debug(GOSSIP_FILE_DEBUG,
  590. "%s(%pU): AFTER wait_for_io: offset is %d\n",
  591. __func__,
  592. handle,
  593. (int)*offset);
  594. /*
  595. * if we got a short I/O operations,
  596. * fall out and return what we got so far
  597. */
  598. if (amt_complete < each_count)
  599. break;
  600. } /*end while */
  601. if (total_count > 0)
  602. ret = total_count;
  603. out:
  604. if (to_free) {
  605. kfree(iovecptr);
  606. kfree(seg_array);
  607. }
  608. if (ret > 0) {
  609. if (type == PVFS_IO_READ) {
  610. file_accessed(file);
  611. } else {
  612. SetMtimeFlag(pvfs2_inode);
  613. inode->i_mtime = CURRENT_TIME;
  614. mark_inode_dirty_sync(inode);
  615. }
  616. }
  617. gossip_debug(GOSSIP_FILE_DEBUG,
  618. "%s(%pU): Value(%d) returned.\n",
  619. __func__,
  620. handle,
  621. (int)ret);
  622. return ret;
  623. }
  624. /*
  625. * Read data from a specified offset in a file (referenced by inode).
  626. * Data may be placed either in a user or kernel buffer.
  627. */
  628. ssize_t pvfs2_inode_read(struct inode *inode,
  629. char __user *buf,
  630. size_t count,
  631. loff_t *offset,
  632. loff_t readahead_size)
  633. {
  634. struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
  635. size_t bufmap_size;
  636. struct iovec vec;
  637. ssize_t ret = -EINVAL;
  638. g_pvfs2_stats.reads++;
  639. vec.iov_base = buf;
  640. vec.iov_len = count;
  641. bufmap_size = pvfs_bufmap_size_query();
  642. if (count > bufmap_size) {
  643. gossip_debug(GOSSIP_FILE_DEBUG,
  644. "%s: count is too large (%zd/%zd)!\n",
  645. __func__, count, bufmap_size);
  646. return -EINVAL;
  647. }
  648. gossip_debug(GOSSIP_FILE_DEBUG,
  649. "%s(%pU) %zd@%llu\n",
  650. __func__,
  651. &pvfs2_inode->refn.khandle,
  652. count,
  653. llu(*offset));
  654. ret = wait_for_direct_io(PVFS_IO_READ, inode, offset, &vec, 1,
  655. count, readahead_size, 0);
  656. if (ret > 0)
  657. *offset += ret;
  658. gossip_debug(GOSSIP_FILE_DEBUG,
  659. "%s(%pU): Value(%zd) returned.\n",
  660. __func__,
  661. &pvfs2_inode->refn.khandle,
  662. ret);
  663. return ret;
  664. }
  665. static ssize_t pvfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
  666. {
  667. struct file *file = iocb->ki_filp;
  668. loff_t pos = *(&iocb->ki_pos);
  669. ssize_t rc = 0;
  670. unsigned long nr_segs = iter->nr_segs;
  671. BUG_ON(iocb->private);
  672. gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_read_iter\n");
  673. g_pvfs2_stats.reads++;
  674. rc = do_readv_writev(PVFS_IO_READ,
  675. file,
  676. &pos,
  677. iter->iov,
  678. nr_segs);
  679. iocb->ki_pos = pos;
  680. return rc;
  681. }
  682. static ssize_t pvfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
  683. {
  684. struct file *file = iocb->ki_filp;
  685. loff_t pos = *(&iocb->ki_pos);
  686. unsigned long nr_segs = iter->nr_segs;
  687. ssize_t rc;
  688. BUG_ON(iocb->private);
  689. gossip_debug(GOSSIP_FILE_DEBUG, "pvfs2_file_write_iter\n");
  690. mutex_lock(&file->f_mapping->host->i_mutex);
  691. /* Make sure generic_write_checks sees an up to date inode size. */
  692. if (file->f_flags & O_APPEND) {
  693. rc = pvfs2_inode_getattr(file->f_mapping->host,
  694. PVFS_ATTR_SYS_SIZE);
  695. if (rc) {
  696. gossip_err("%s: pvfs2_inode_getattr failed, rc:%zd:.\n",
  697. __func__, rc);
  698. goto out;
  699. }
  700. }
  701. if (file->f_pos > i_size_read(file->f_mapping->host))
  702. pvfs2_i_size_write(file->f_mapping->host, file->f_pos);
  703. rc = generic_write_checks(iocb, iter);
  704. if (rc <= 0) {
  705. gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
  706. __func__, rc);
  707. goto out;
  708. }
  709. rc = do_readv_writev(PVFS_IO_WRITE,
  710. file,
  711. &pos,
  712. iter->iov,
  713. nr_segs);
  714. if (rc < 0) {
  715. gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
  716. __func__, rc);
  717. goto out;
  718. }
  719. iocb->ki_pos = pos;
  720. g_pvfs2_stats.writes++;
  721. out:
  722. mutex_unlock(&file->f_mapping->host->i_mutex);
  723. return rc;
  724. }
  725. /*
  726. * Perform a miscellaneous operation on a file.
  727. */
  728. long pvfs2_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
  729. {
  730. int ret = -ENOTTY;
  731. __u64 val = 0;
  732. unsigned long uval;
  733. gossip_debug(GOSSIP_FILE_DEBUG,
  734. "pvfs2_ioctl: called with cmd %d\n",
  735. cmd);
  736. /*
  737. * we understand some general ioctls on files, such as the immutable
  738. * and append flags
  739. */
  740. if (cmd == FS_IOC_GETFLAGS) {
  741. val = 0;
  742. ret = pvfs2_xattr_get_default(file->f_path.dentry,
  743. "user.pvfs2.meta_hint",
  744. &val,
  745. sizeof(val),
  746. 0);
  747. if (ret < 0 && ret != -ENODATA)
  748. return ret;
  749. else if (ret == -ENODATA)
  750. val = 0;
  751. uval = val;
  752. gossip_debug(GOSSIP_FILE_DEBUG,
  753. "pvfs2_ioctl: FS_IOC_GETFLAGS: %llu\n",
  754. (unsigned long long)uval);
  755. return put_user(uval, (int __user *)arg);
  756. } else if (cmd == FS_IOC_SETFLAGS) {
  757. ret = 0;
  758. if (get_user(uval, (int __user *)arg))
  759. return -EFAULT;
  760. /*
  761. * PVFS_MIRROR_FL is set internally when the mirroring mode
  762. * is turned on for a file. The user is not allowed to turn
  763. * on this bit, but the bit is present if the user first gets
  764. * the flags and then updates the flags with some new
  765. * settings. So, we ignore it in the following edit. bligon.
  766. */
  767. if ((uval & ~PVFS_MIRROR_FL) &
  768. (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
  769. gossip_err("pvfs2_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
  770. return -EINVAL;
  771. }
  772. val = uval;
  773. gossip_debug(GOSSIP_FILE_DEBUG,
  774. "pvfs2_ioctl: FS_IOC_SETFLAGS: %llu\n",
  775. (unsigned long long)val);
  776. ret = pvfs2_xattr_set_default(file->f_path.dentry,
  777. "user.pvfs2.meta_hint",
  778. &val,
  779. sizeof(val),
  780. 0,
  781. 0);
  782. }
  783. return ret;
  784. }
  785. /*
  786. * Memory map a region of a file.
  787. */
  788. static int pvfs2_file_mmap(struct file *file, struct vm_area_struct *vma)
  789. {
  790. gossip_debug(GOSSIP_FILE_DEBUG,
  791. "pvfs2_file_mmap: called on %s\n",
  792. (file ?
  793. (char *)file->f_path.dentry->d_name.name :
  794. (char *)"Unknown"));
  795. /* set the sequential readahead hint */
  796. vma->vm_flags |= VM_SEQ_READ;
  797. vma->vm_flags &= ~VM_RAND_READ;
  798. return generic_file_mmap(file, vma);
  799. }
  800. #define mapping_nrpages(idata) ((idata)->nrpages)
  801. /*
  802. * Called to notify the module that there are no more references to
  803. * this file (i.e. no processes have it open).
  804. *
  805. * \note Not called when each file is closed.
  806. */
  807. int pvfs2_file_release(struct inode *inode, struct file *file)
  808. {
  809. gossip_debug(GOSSIP_FILE_DEBUG,
  810. "pvfs2_file_release: called on %s\n",
  811. file->f_path.dentry->d_name.name);
  812. pvfs2_flush_inode(inode);
  813. /*
  814. remove all associated inode pages from the page cache and mmap
  815. readahead cache (if any); this forces an expensive refresh of
  816. data for the next caller of mmap (or 'get_block' accesses)
  817. */
  818. if (file->f_path.dentry->d_inode &&
  819. file->f_path.dentry->d_inode->i_mapping &&
  820. mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
  821. truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
  822. 0);
  823. return 0;
  824. }
  825. /*
  826. * Push all data for a specific file onto permanent storage.
  827. */
  828. int pvfs2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
  829. {
  830. int ret = -EINVAL;
  831. struct pvfs2_inode_s *pvfs2_inode =
  832. PVFS2_I(file->f_path.dentry->d_inode);
  833. struct pvfs2_kernel_op_s *new_op = NULL;
  834. /* required call */
  835. filemap_write_and_wait_range(file->f_mapping, start, end);
  836. new_op = op_alloc(PVFS2_VFS_OP_FSYNC);
  837. if (!new_op)
  838. return -ENOMEM;
  839. new_op->upcall.req.fsync.refn = pvfs2_inode->refn;
  840. ret = service_operation(new_op,
  841. "pvfs2_fsync",
  842. get_interruptible_flag(file->f_path.dentry->d_inode));
  843. gossip_debug(GOSSIP_FILE_DEBUG,
  844. "pvfs2_fsync got return value of %d\n",
  845. ret);
  846. op_release(new_op);
  847. pvfs2_flush_inode(file->f_path.dentry->d_inode);
  848. return ret;
  849. }
  850. /*
  851. * Change the file pointer position for an instance of an open file.
  852. *
  853. * \note If .llseek is overriden, we must acquire lock as described in
  854. * Documentation/filesystems/Locking.
  855. *
  856. * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
  857. * require much changes to the FS
  858. */
  859. loff_t pvfs2_file_llseek(struct file *file, loff_t offset, int origin)
  860. {
  861. int ret = -EINVAL;
  862. struct inode *inode = file->f_path.dentry->d_inode;
  863. if (!inode) {
  864. gossip_err("pvfs2_file_llseek: invalid inode (NULL)\n");
  865. return ret;
  866. }
  867. if (origin == PVFS2_SEEK_END) {
  868. /*
  869. * revalidate the inode's file size.
  870. * NOTE: We are only interested in file size here,
  871. * so we set mask accordingly.
  872. */
  873. ret = pvfs2_inode_getattr(inode, PVFS_ATTR_SYS_SIZE);
  874. if (ret) {
  875. gossip_debug(GOSSIP_FILE_DEBUG,
  876. "%s:%s:%d calling make bad inode\n",
  877. __FILE__,
  878. __func__,
  879. __LINE__);
  880. pvfs2_make_bad_inode(inode);
  881. return ret;
  882. }
  883. }
  884. gossip_debug(GOSSIP_FILE_DEBUG,
  885. "pvfs2_file_llseek: offset is %ld | origin is %d | "
  886. "inode size is %lu\n",
  887. (long)offset,
  888. origin,
  889. (unsigned long)file->f_path.dentry->d_inode->i_size);
  890. return generic_file_llseek(file, offset, origin);
  891. }
  892. /*
  893. * Support local locks (locks that only this kernel knows about)
  894. * if Orangefs was mounted -o local_lock.
  895. */
  896. int pvfs2_lock(struct file *filp, int cmd, struct file_lock *fl)
  897. {
  898. int rc = -ENOLCK;
  899. if (PVFS2_SB(filp->f_inode->i_sb)->flags & PVFS2_OPT_LOCAL_LOCK) {
  900. if (cmd == F_GETLK) {
  901. rc = 0;
  902. posix_test_lock(filp, fl);
  903. } else {
  904. rc = posix_lock_file(filp, fl, NULL);
  905. }
  906. }
  907. return rc;
  908. }
  909. /** PVFS2 implementation of VFS file operations */
  910. const struct file_operations pvfs2_file_operations = {
  911. .llseek = pvfs2_file_llseek,
  912. .read_iter = pvfs2_file_read_iter,
  913. .write_iter = pvfs2_file_write_iter,
  914. .lock = pvfs2_lock,
  915. .unlocked_ioctl = pvfs2_ioctl,
  916. .mmap = pvfs2_file_mmap,
  917. .open = generic_file_open,
  918. .release = pvfs2_file_release,
  919. .fsync = pvfs2_fsync,
  920. };