|
@@ -256,168 +256,6 @@ out:
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * The reason we need to do this is to be able to support readv and writev
|
|
|
- * that are larger than (pvfs_bufmap_size_query()) Default is
|
|
|
- * PVFS2_BUFMAP_DEFAULT_DESC_SIZE MB. What that means is that we will
|
|
|
- * create a new io vec descriptor for those memory addresses that
|
|
|
- * go beyond the limit. Return value for this routine is negative in case
|
|
|
- * of errors and 0 in case of success.
|
|
|
- *
|
|
|
- * Further, the new_nr_segs pointer is updated to hold the new value
|
|
|
- * of number of iovecs, the new_vec pointer is updated to hold the pointer
|
|
|
- * to the new split iovec, and the size array is an array of integers holding
|
|
|
- * the number of iovecs that straddle pvfs_bufmap_size_query().
|
|
|
- * The max_new_nr_segs value is computed by the caller and returned.
|
|
|
- * (It will be (count of all iov_len/ block_size) + 1).
|
|
|
- */
|
|
|
-static int split_iovecs(unsigned long max_new_nr_segs, /* IN */
|
|
|
- unsigned long nr_segs, /* IN */
|
|
|
- const struct iovec *original_iovec, /* IN */
|
|
|
- unsigned long *new_nr_segs, /* OUT */
|
|
|
- struct iovec **new_vec, /* OUT */
|
|
|
- unsigned long *seg_count, /* OUT */
|
|
|
- unsigned long **seg_array) /* OUT */
|
|
|
-{
|
|
|
- unsigned long seg;
|
|
|
- unsigned long count = 0;
|
|
|
- unsigned long begin_seg;
|
|
|
- unsigned long tmpnew_nr_segs = 0;
|
|
|
- struct iovec *new_iovec = NULL;
|
|
|
- struct iovec *orig_iovec;
|
|
|
- unsigned long *sizes = NULL;
|
|
|
- unsigned long sizes_count = 0;
|
|
|
-
|
|
|
- if (nr_segs <= 0 ||
|
|
|
- original_iovec == NULL ||
|
|
|
- new_nr_segs == NULL ||
|
|
|
- new_vec == NULL ||
|
|
|
- seg_count == NULL ||
|
|
|
- seg_array == NULL ||
|
|
|
- max_new_nr_segs <= 0) {
|
|
|
- gossip_err("Invalid parameters to split_iovecs\n");
|
|
|
- return -EINVAL;
|
|
|
- }
|
|
|
- *new_nr_segs = 0;
|
|
|
- *new_vec = NULL;
|
|
|
- *seg_count = 0;
|
|
|
- *seg_array = NULL;
|
|
|
- /* copy the passed in iovec descriptor to a temp structure */
|
|
|
- orig_iovec = kmalloc_array(nr_segs,
|
|
|
- sizeof(*orig_iovec),
|
|
|
- PVFS2_BUFMAP_GFP_FLAGS);
|
|
|
- if (orig_iovec == NULL) {
|
|
|
- gossip_err(
|
|
|
- "split_iovecs: Could not allocate memory for %lu bytes!\n",
|
|
|
- (unsigned long)(nr_segs * sizeof(*orig_iovec)));
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
- new_iovec = kcalloc(max_new_nr_segs,
|
|
|
- sizeof(*new_iovec),
|
|
|
- PVFS2_BUFMAP_GFP_FLAGS);
|
|
|
- if (new_iovec == NULL) {
|
|
|
- kfree(orig_iovec);
|
|
|
- gossip_err(
|
|
|
- "split_iovecs: Could not allocate memory for %lu bytes!\n",
|
|
|
- (unsigned long)(max_new_nr_segs * sizeof(*new_iovec)));
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
- sizes = kcalloc(max_new_nr_segs,
|
|
|
- sizeof(*sizes),
|
|
|
- PVFS2_BUFMAP_GFP_FLAGS);
|
|
|
- if (sizes == NULL) {
|
|
|
- kfree(new_iovec);
|
|
|
- kfree(orig_iovec);
|
|
|
- gossip_err(
|
|
|
- "split_iovecs: Could not allocate memory for %lu bytes!\n",
|
|
|
- (unsigned long)(max_new_nr_segs * sizeof(*sizes)));
|
|
|
- return -ENOMEM;
|
|
|
- }
|
|
|
- /* copy the passed in iovec to a temp structure */
|
|
|
- memcpy(orig_iovec, original_iovec, nr_segs * sizeof(*orig_iovec));
|
|
|
- begin_seg = 0;
|
|
|
-repeat:
|
|
|
- for (seg = begin_seg; seg < nr_segs; seg++) {
|
|
|
- if (tmpnew_nr_segs >= max_new_nr_segs ||
|
|
|
- sizes_count >= max_new_nr_segs) {
|
|
|
- kfree(sizes);
|
|
|
- kfree(orig_iovec);
|
|
|
- kfree(new_iovec);
|
|
|
- gossip_err
|
|
|
- ("split_iovecs: exceeded the index limit (%lu)\n",
|
|
|
- tmpnew_nr_segs);
|
|
|
- return -EINVAL;
|
|
|
- }
|
|
|
- if (count + orig_iovec[seg].iov_len <
|
|
|
- pvfs_bufmap_size_query()) {
|
|
|
- count += orig_iovec[seg].iov_len;
|
|
|
- memcpy(&new_iovec[tmpnew_nr_segs],
|
|
|
- &orig_iovec[seg],
|
|
|
- sizeof(*new_iovec));
|
|
|
- tmpnew_nr_segs++;
|
|
|
- sizes[sizes_count]++;
|
|
|
- } else {
|
|
|
- new_iovec[tmpnew_nr_segs].iov_base =
|
|
|
- orig_iovec[seg].iov_base;
|
|
|
- new_iovec[tmpnew_nr_segs].iov_len =
|
|
|
- (pvfs_bufmap_size_query() - count);
|
|
|
- tmpnew_nr_segs++;
|
|
|
- sizes[sizes_count]++;
|
|
|
- sizes_count++;
|
|
|
- begin_seg = seg;
|
|
|
- orig_iovec[seg].iov_base +=
|
|
|
- (pvfs_bufmap_size_query() - count);
|
|
|
- orig_iovec[seg].iov_len -=
|
|
|
- (pvfs_bufmap_size_query() - count);
|
|
|
- count = 0;
|
|
|
- break;
|
|
|
- }
|
|
|
- }
|
|
|
- if (seg != nr_segs)
|
|
|
- goto repeat;
|
|
|
- else
|
|
|
- sizes_count++;
|
|
|
-
|
|
|
- *new_nr_segs = tmpnew_nr_segs;
|
|
|
- /* new_iovec is freed by the caller */
|
|
|
- *new_vec = new_iovec;
|
|
|
- *seg_count = sizes_count;
|
|
|
- /* seg_array is also freed by the caller */
|
|
|
- *seg_array = sizes;
|
|
|
- kfree(orig_iovec);
|
|
|
- return 0;
|
|
|
-}
|
|
|
-
|
|
|
-static long bound_max_iovecs(const struct iovec *curr, unsigned long nr_segs,
|
|
|
- ssize_t *total_count)
|
|
|
-{
|
|
|
- unsigned long i;
|
|
|
- long max_nr_iovecs;
|
|
|
- ssize_t total;
|
|
|
- ssize_t count;
|
|
|
-
|
|
|
- total = 0;
|
|
|
- count = 0;
|
|
|
- max_nr_iovecs = 0;
|
|
|
- for (i = 0; i < nr_segs; i++) {
|
|
|
- const struct iovec *iv = &curr[i];
|
|
|
-
|
|
|
- count += iv->iov_len;
|
|
|
- if (unlikely((ssize_t) (count | iv->iov_len) < 0))
|
|
|
- return -EINVAL;
|
|
|
- if (total + iv->iov_len < pvfs_bufmap_size_query()) {
|
|
|
- total += iv->iov_len;
|
|
|
- max_nr_iovecs++;
|
|
|
- } else {
|
|
|
- total =
|
|
|
- (total + iv->iov_len - pvfs_bufmap_size_query());
|
|
|
- max_nr_iovecs += (total / pvfs_bufmap_size_query() + 2);
|
|
|
- }
|
|
|
- }
|
|
|
- *total_count = count;
|
|
|
- return max_nr_iovecs;
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* Common entry point for read/write/readv/writev
|
|
|
* This function will dispatch it to either the direct I/O
|
|
@@ -431,25 +269,10 @@ static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
|
|
|
struct inode *inode = file->f_mapping->host;
|
|
|
struct pvfs2_inode_s *pvfs2_inode = PVFS2_I(inode);
|
|
|
struct pvfs2_khandle *handle = &pvfs2_inode->refn.khandle;
|
|
|
- ssize_t ret;
|
|
|
- ssize_t total_count;
|
|
|
- unsigned int to_free;
|
|
|
- size_t count;
|
|
|
- unsigned long seg;
|
|
|
- unsigned long new_nr_segs;
|
|
|
- unsigned long max_new_nr_segs;
|
|
|
- unsigned long seg_count;
|
|
|
- unsigned long *seg_array;
|
|
|
- struct iovec *iovecptr;
|
|
|
- struct iovec *ptr;
|
|
|
-
|
|
|
- total_count = 0;
|
|
|
- ret = -EINVAL;
|
|
|
- count = 0;
|
|
|
- to_free = 0;
|
|
|
-
|
|
|
- /* Compute total and max number of segments after split */
|
|
|
- max_new_nr_segs = bound_max_iovecs(iov, nr_segs, &count);
|
|
|
+ struct iov_iter iter;
|
|
|
+ size_t count = iov_length(iov, nr_segs);
|
|
|
+ ssize_t total_count = 0;
|
|
|
+ ssize_t ret = -EINVAL;
|
|
|
|
|
|
gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
"%s-BEGIN(%pU): count(%d) after estimate_max_iovecs.\n",
|
|
@@ -472,93 +295,10 @@ static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
|
|
|
goto out;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- * if the total size of data transfer requested is greater than
|
|
|
- * the kernel-set blocksize of PVFS2, then we split the iovecs
|
|
|
- * such that no iovec description straddles a block size limit
|
|
|
- */
|
|
|
-
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s: pvfs_bufmap_size:%d\n",
|
|
|
- __func__,
|
|
|
- pvfs_bufmap_size_query());
|
|
|
-
|
|
|
- if (count > pvfs_bufmap_size_query()) {
|
|
|
- /*
|
|
|
- * Split up the given iovec description such that
|
|
|
- * no iovec descriptor straddles over the block-size limitation.
|
|
|
- * This makes us our job easier to stage the I/O.
|
|
|
- * In addition, this function will also compute an array
|
|
|
- * with seg_count entries that will store the number of
|
|
|
- * segments that straddle the block-size boundaries.
|
|
|
- */
|
|
|
- ret = split_iovecs(max_new_nr_segs, /* IN */
|
|
|
- nr_segs, /* IN */
|
|
|
- iov, /* IN */
|
|
|
- &new_nr_segs, /* OUT */
|
|
|
- &iovecptr, /* OUT */
|
|
|
- &seg_count, /* OUT */
|
|
|
- &seg_array); /* OUT */
|
|
|
- if (ret < 0) {
|
|
|
- gossip_err("%s: Failed to split iovecs to satisfy larger than blocksize readv/writev request %zd\n",
|
|
|
- __func__,
|
|
|
- ret);
|
|
|
- goto out;
|
|
|
- }
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s: Splitting iovecs from %lu to %lu"
|
|
|
- " [max_new %lu]\n",
|
|
|
- __func__,
|
|
|
- nr_segs,
|
|
|
- new_nr_segs,
|
|
|
- max_new_nr_segs);
|
|
|
- /* We must free seg_array and iovecptr */
|
|
|
- to_free = 1;
|
|
|
- } else {
|
|
|
- new_nr_segs = nr_segs;
|
|
|
- /* use the given iovec description */
|
|
|
- iovecptr = (struct iovec *)iov;
|
|
|
- /* There is only 1 element in the seg_array */
|
|
|
- seg_count = 1;
|
|
|
- /* and its value is the number of segments passed in */
|
|
|
- seg_array = &nr_segs;
|
|
|
- /* We dont have to free up anything */
|
|
|
- to_free = 0;
|
|
|
- }
|
|
|
- ptr = iovecptr;
|
|
|
+ iov_iter_init(&iter, type == PVFS_IO_READ ? READ : WRITE,
|
|
|
+ iov, nr_segs, count);
|
|
|
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s(%pU) %zd@%llu\n",
|
|
|
- __func__,
|
|
|
- handle,
|
|
|
- count,
|
|
|
- llu(*offset));
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s(%pU): new_nr_segs: %lu, seg_count: %lu\n",
|
|
|
- __func__,
|
|
|
- handle,
|
|
|
- new_nr_segs, seg_count);
|
|
|
-
|
|
|
-/* PVFS2_KERNEL_DEBUG is a CFLAGS define. */
|
|
|
-#ifdef PVFS2_KERNEL_DEBUG
|
|
|
- for (seg = 0; seg < new_nr_segs; seg++)
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s: %d) %p to %p [%d bytes]\n",
|
|
|
- __func__,
|
|
|
- (int)seg + 1,
|
|
|
- iovecptr[seg].iov_base,
|
|
|
- iovecptr[seg].iov_base + iovecptr[seg].iov_len,
|
|
|
- (int)iovecptr[seg].iov_len);
|
|
|
- for (seg = 0; seg < seg_count; seg++)
|
|
|
- gossip_debug(GOSSIP_FILE_DEBUG,
|
|
|
- "%s: %zd) %lu\n",
|
|
|
- __func__,
|
|
|
- seg + 1,
|
|
|
- seg_array[seg]);
|
|
|
-#endif
|
|
|
- seg = 0;
|
|
|
while (total_count < count) {
|
|
|
- struct iov_iter iter;
|
|
|
size_t each_count;
|
|
|
size_t amt_complete;
|
|
|
|
|
@@ -579,9 +319,6 @@ static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
|
|
|
handle,
|
|
|
(int)*offset);
|
|
|
|
|
|
- iov_iter_init(&iter, type == PVFS_IO_READ ? READ : WRITE,
|
|
|
- ptr, seg_array[seg], each_count);
|
|
|
-
|
|
|
ret = wait_for_direct_io(type, inode, offset, &iter,
|
|
|
each_count, 0);
|
|
|
gossip_debug(GOSSIP_FILE_DEBUG,
|
|
@@ -593,9 +330,6 @@ static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
|
|
|
if (ret < 0)
|
|
|
goto out;
|
|
|
|
|
|
- /* advance the iovec pointer */
|
|
|
- ptr += seg_array[seg];
|
|
|
- seg++;
|
|
|
*offset += ret;
|
|
|
total_count += ret;
|
|
|
amt_complete = ret;
|
|
@@ -617,10 +351,6 @@ static ssize_t do_readv_writev(enum PVFS_io_type type, struct file *file,
|
|
|
if (total_count > 0)
|
|
|
ret = total_count;
|
|
|
out:
|
|
|
- if (to_free) {
|
|
|
- kfree(iovecptr);
|
|
|
- kfree(seg_array);
|
|
|
- }
|
|
|
if (ret > 0) {
|
|
|
if (type == PVFS_IO_READ) {
|
|
|
file_accessed(file);
|