Browse Source

net/rpmsg: add support to handle a remote processor error recovery

The rpmsg_proto driver is used to provide a socket interface to
userspace under the AF_RPMSG address family, and is used by the
TI IPC MessageQ stack. The rpmsg proto driver uses a single rpmsg
channel device published from a remote processor to transmit and
receive all socket-based messages to/from that remote processor.
There can be any number of Tx and Rx sockets associated with each
remote processor's rpmsg device. This rpmsg channel device will be
auto-removed and recreated if the associated remote processor goes
through an error recovery process. Any existing open sockets (both
Tx and Rx) are oblivious if the underlying rpmsg channel has been
removed, and any further operations on such sockets can create
various kernel crashes due to invalid pointer dereferences.

This patch adds the error recovery support to the rpmsg-proto driver.
This is achieved by using the private field of the published rpmsg
channel device's endpoint (ept->priv) to maintain a list of all the
connected and bound sockets, and setting a new error status
(RPMSG_ERROR) on all these open sockets when the associated parent
rpmsg device is removed. This new error status allows the driver
to check for a valid state of a socket before performing any actions
on it thereby preventing any kernel crashes. The status is also used
to allow the userspace to perform appropriate cleanup and/or recovery
steps.

The logic is asymmetric because of the slight difference between the
Rx and Tx sockets. All the Tx sockets use the one-time published
rpmsg_channel devices for transmissions and just need to be marked
with the error status, while each of the Rx sockets have their own
derivative rpmsg endpoints, and so need to be removed alongside the
removal of the associated rpmsg channel device in addition. The
sockets themselves are freed up anytime either by the userspace
closing them or through an automatic close when the process is
terminated/closed.

Signed-off-by: Suman Anna <s-anna@ti.com>
Suman Anna 8 years ago
parent
commit
3f8abde36d
1 changed files with 83 additions and 13 deletions
  1. 83 13
      net/rpmsg/rpmsg_proto.c

+ 83 - 13
net/rpmsg/rpmsg_proto.c

@@ -39,6 +39,7 @@ struct rpmsg_socket {
 	struct rpmsg_device *rpdev;
 	struct rpmsg_endpoint *endpt;
 	int rproc_id;
+	struct list_head elem;
 };
 
 /* Connection and socket states */
@@ -47,6 +48,7 @@ enum {
 	RPMSG_OPEN,
 	RPMSG_LISTENING,
 	RPMSG_CLOSED,
+	RPMSG_ERROR,
 };
 
 /* A single-level radix-tree-based scheme is used to maintain the rpmsg
@@ -125,12 +127,11 @@ static int rpmsg_sock_connect(struct socket *sock, struct sockaddr *addr,
 
 	sa = (struct sockaddr_rpmsg *)addr;
 
+	mutex_lock(&rpmsg_channels_lock);
 	lock_sock(sk);
 
 	rpsk = container_of(sk, struct rpmsg_socket, sk);
 
-	mutex_lock(&rpmsg_channels_lock);
-
 	/* find the set of channels exposed by this remote processor */
 	rpdev = radix_tree_lookup(&rpmsg_channels, sa->vproc_id);
 	if (!rpdev) {
@@ -141,12 +142,15 @@ static int rpmsg_sock_connect(struct socket *sock, struct sockaddr *addr,
 	rpsk->rproc_id = sa->vproc_id;
 	rpsk->rpdev = rpdev;
 
+	/* bind this socket with its parent rpmsg device */
+	list_add_tail(&rpsk->elem, rpdev->ept->priv);
+
 	/* XXX take care of disconnection state too */
 	sk->sk_state = RPMSG_CONNECTED;
 
 out:
-	mutex_unlock(&rpmsg_channels_lock);
 	release_sock(sk);
+	mutex_unlock(&rpmsg_channels_lock);
 	return err;
 }
 
@@ -326,17 +330,29 @@ static int rpmsg_sock_release(struct socket *sock)
 {
 	struct sock *sk = sock->sk;
 	struct rpmsg_socket *rpsk = container_of(sk, struct rpmsg_socket, sk);
+	struct rpmsg_endpoint *endpt;
 
 	if (!sk)
 		return 0;
 
-	/* function can be called with NULL endpoints, so it is effective for
-	 * Rx sockets and a no-op for Tx sockets
-	 */
-	rpmsg_destroy_ept(rpsk->endpt);
+	if (sk->sk_state == RPMSG_OPEN)
+		goto out;
 
-	sock_put(sock->sk);
+	lock_sock(sk);
+	if (sk->sk_state != RPMSG_ERROR) {
+		rpsk->rpdev = NULL;
+		list_del(&rpsk->elem);
+		endpt = rpsk->endpt;
+		rpsk->endpt = NULL;
+		release_sock(sk);
+		if (endpt)
+			rpmsg_destroy_ept(endpt);
+		goto out;
+	}
+	release_sock(sk);
 
+out:
+	sock_put(sock->sk);
 	return 0;
 }
 
@@ -355,6 +371,7 @@ rpmsg_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct rpmsg_endpoint *endpt;
 	struct rpmsg_channel_info chinfo = {};
 	struct sockaddr_rpmsg *sa = (struct sockaddr_rpmsg *)uaddr;
+	int ret = 0;
 
 	if (sock->state == SS_CONNECTED)
 		return -EINVAL;
@@ -371,24 +388,37 @@ rpmsg_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	if (sk->sk_state != RPMSG_OPEN)
 		return -EINVAL;
 
+	mutex_lock(&rpmsg_channels_lock);
+
 	rpdev = radix_tree_lookup(&rpmsg_channels, sa->vproc_id);
-	if (!rpdev)
-		return -EINVAL;
+	if (!rpdev) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	/* bind this socket with a receiving endpoint */
 	chinfo.src = sa->addr;
 	chinfo.dst = RPMSG_ADDR_ANY;
 	endpt = rpmsg_create_ept(rpdev, rpmsg_sock_cb, sk, chinfo);
-	if (!endpt)
-		return -EINVAL;
+	if (!endpt) {
+		ret = -EINVAL;
+		goto out;
+	}
 
+	lock_sock(sk);
 	rpsk->rpdev = rpdev;
 	rpsk->endpt = endpt;
 	rpsk->rproc_id = sa->vproc_id;
 
+	/* bind this socket with its parent rpmsg device */
+	list_add_tail(&rpsk->elem, rpdev->ept->priv);
+
 	sk->sk_state = RPMSG_LISTENING;
+	release_sock(sk);
 
-	return 0;
+out:
+	mutex_unlock(&rpmsg_channels_lock);
+	return ret;
 }
 
 static const struct proto_ops rpmsg_sock_ops = {
@@ -442,6 +472,7 @@ static int rpmsg_sock_create(struct net *net, struct socket *sock, int proto,
 	sk->sk_state = RPMSG_OPEN;
 
 	rpsk = container_of(sk, struct rpmsg_socket, sk);
+	INIT_LIST_HEAD(&rpsk->elem);
 	/* use RPMSG_LOCALHOST to serve as an invalid value */
 	rpsk->rproc_id = RPMSG_LOCALHOST;
 
@@ -529,6 +560,7 @@ static int rpmsg_proto_probe(struct rpmsg_device *rpdev)
 	struct device *dev = &rpdev->dev;
 	int ret, dst = rpdev->dst, id;
 	struct rpmsg_device *vrp_dev;
+	struct list_head *sock_list = NULL;
 
 	if (WARN_ON(dst == RPMSG_ADDR_ANY))
 		return -EINVAL;
@@ -556,6 +588,20 @@ static int rpmsg_proto_probe(struct rpmsg_device *rpdev)
 		goto out;
 	}
 
+	/* reuse the rpdev endpoint's private field for storing the list of
+	 * all connected and bound sockets on this rpmsg device.
+	 */
+	WARN_ON(!!rpdev->ept->priv);
+	sock_list = kzalloc(sizeof(*sock_list), GFP_KERNEL);
+	if (!sock_list) {
+		dev_err(dev, "failed to allocate list_head\n");
+		radix_tree_delete(&rpmsg_channels, id);
+		ret = -ENOMEM;
+		goto out;
+	}
+	INIT_LIST_HEAD(sock_list);
+	rpdev->ept->priv = sock_list;
+
 out:
 	mutex_unlock(&rpmsg_channels_lock);
 
@@ -567,6 +613,9 @@ static void rpmsg_proto_remove(struct rpmsg_device *rpdev)
 	struct device *dev = &rpdev->dev;
 	int id, dst = rpdev->dst;
 	struct rpmsg_device *vrp_dev;
+	struct list_head *sk_list;
+	struct rpmsg_socket *rpsk, *tmp;
+	struct rpmsg_endpoint *endpt = NULL;
 
 	if (dst == RPMSG_ADDR_ANY)
 		return;
@@ -586,6 +635,27 @@ static void rpmsg_proto_remove(struct rpmsg_device *rpdev)
 	if (!radix_tree_delete(&rpmsg_channels, id))
 		dev_err(dev, "failed to delete rpdev for rproc %d\n", id);
 
+	/* mark all associated sockets invalid and remove them from the
+	 * rpdev's list. Destroy the endpoints for bound sockets as the
+	 * parent rpdev will not exist until the socket's release()
+	 */
+	sk_list = rpdev->ept->priv;
+	list_for_each_entry_safe(rpsk, tmp, sk_list, elem) {
+		lock_sock(&rpsk->sk);
+		if (rpsk->rpdev) {
+			rpsk->rpdev = NULL;
+			rpsk->sk.sk_state = RPMSG_ERROR;
+			list_del(&rpsk->elem);
+			endpt = rpsk->endpt;
+			rpsk->endpt = NULL;
+		}
+		release_sock(&rpsk->sk);
+		if (endpt)
+			rpmsg_destroy_ept(endpt);
+	}
+	kfree(sk_list);
+	rpdev->ept->priv = NULL;
+
 out:
 	mutex_unlock(&rpmsg_channels_lock);
 }