@@ -60,8 +60,59 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 				      const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
+EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	spin_unlock(&nf_conntrack_locks[h1]);
+	if (h1 != h2)
+		spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+				     unsigned int h2, unsigned int sequence)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	if (h1 <= h2) {
+		spin_lock(&nf_conntrack_locks[h1]);
+		if (h1 != h2)
+			spin_lock_nested(&nf_conntrack_locks[h2],
+					 SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&nf_conntrack_locks[h2]);
+		spin_lock_nested(&nf_conntrack_locks[h1],
+				 SINGLE_DEPTH_NESTING);
+	}
+	if (read_seqcount_retry(&net->ct.generation, sequence)) {
+		nf_conntrack_double_unlock(h1, h2);
+		return true;
+	}
+	return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_unlock(&nf_conntrack_locks[i]);
+}
 
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
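
The hunk above replaces the global nf_conntrack_lock with an array of bucket locks plus the ct.generation seqcount. As a rough illustration of that ordering-and-retry idea only (not part of the patch), here is a hypothetical user-space sketch in which pthread mutexes and a plain atomic counter stand in for the kernel spinlocks and seqcount: a caller samples the generation, computes its two bucket hashes, calls double_lock(), and starts over whenever it returns true.

/* Illustrative sketch, hypothetical names throughout. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

#define BUCKET_LOCKS 1024

static pthread_mutex_t bucket_locks[BUCKET_LOCKS] = {
	[0 ... BUCKET_LOCKS - 1] = PTHREAD_MUTEX_INITIALIZER
};
static atomic_uint table_generation;	/* bumped by a table resizer */

static void double_unlock(unsigned int h1, unsigned int h2)
{
	h1 %= BUCKET_LOCKS;
	h2 %= BUCKET_LOCKS;
	pthread_mutex_unlock(&bucket_locks[h1]);
	if (h1 != h2)
		pthread_mutex_unlock(&bucket_locks[h2]);
}

/* Take both bucket locks in ascending index order (avoids ABBA deadlock) and
 * report whether a concurrent resize made the caller's hashes stale.
 */
static bool double_lock(unsigned int h1, unsigned int h2, unsigned int seq)
{
	unsigned int lo, hi;

	h1 %= BUCKET_LOCKS;
	h2 %= BUCKET_LOCKS;
	lo = h1 < h2 ? h1 : h2;
	hi = h1 < h2 ? h2 : h1;

	pthread_mutex_lock(&bucket_locks[lo]);
	if (lo != hi)
		pthread_mutex_lock(&bucket_locks[hi]);

	if (atomic_load(&table_generation) != seq) {
		double_unlock(h1, h2);	/* caller recomputes hashes and retries */
		return true;
	}
	return false;
}
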
@@ -192,6 +243,50 @@ clean_from_lists(struct nf_conn *ct)
 	nf_ct_remove_expectations(ct);
 }
 
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_dying_list(struct nf_conn *ct)
+{
+	struct ct_pcpu *pcpu;
+
+	/* add this conntrack to the (per cpu) dying list */
+	ct->cpu = smp_processor_id();
+	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+	spin_lock(&pcpu->lock);
+	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			     &pcpu->dying);
+	spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_add_to_unconfirmed_list(struct nf_conn *ct)
+{
+	struct ct_pcpu *pcpu;
+
+	/* add this conntrack to the (per cpu) unconfirmed list */
+	ct->cpu = smp_processor_id();
+	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+	spin_lock(&pcpu->lock);
+	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+			     &pcpu->unconfirmed);
+	spin_unlock(&pcpu->lock);
+}
+
+/* must be called with local_bh_disable */
+static void nf_ct_del_from_dying_or_unconfirmed_list(struct nf_conn *ct)
+{
+	struct ct_pcpu *pcpu;
+
+	/* We overload first tuple to link into unconfirmed or dying list.*/
+	pcpu = per_cpu_ptr(nf_ct_net(ct)->ct.pcpu_lists, ct->cpu);
+
+	spin_lock(&pcpu->lock);
+	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
+	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	spin_unlock(&pcpu->lock);
+}
+
 static void
 destroy_conntrack(struct nf_conntrack *nfct)
 {
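
The per-cpu list helpers added above all follow one pattern: remember which CPU's list an entry was put on (ct->cpu), and later take only that CPU's lock to unlink it. A hypothetical user-space analogue of that pattern, again a sketch rather than part of the patch (pthread mutexes and sched_getcpu() stand in for the per-cpu spinlocks and smp_processor_id()):

/* Illustrative sketch, hypothetical names throughout. */
#define _GNU_SOURCE		/* for sched_getcpu() */
#include <pthread.h>
#include <sched.h>

#define NR_BUCKETS 64

struct entry {
	struct entry *next, **pprev;	/* intrusive list linkage */
	int cpu;			/* bucket the entry lives on */
};

struct pcpu_bucket {
	pthread_mutex_t lock;
	struct entry *head;
};

static struct pcpu_bucket buckets[NR_BUCKETS] = {
	[0 ... NR_BUCKETS - 1] = { .lock = PTHREAD_MUTEX_INITIALIZER }
};

static void add_to_local_bucket(struct entry *e)
{
	struct pcpu_bucket *b;

	e->cpu = sched_getcpu() % NR_BUCKETS;	/* stand-in for smp_processor_id() */
	b = &buckets[e->cpu];

	pthread_mutex_lock(&b->lock);
	e->next = b->head;
	e->pprev = &b->head;
	if (b->head)
		b->head->pprev = &e->next;
	b->head = e;
	pthread_mutex_unlock(&b->lock);
}

static void del_from_bucket(struct entry *e)
{
	struct pcpu_bucket *b = &buckets[e->cpu];	/* only this one lock is needed */

	pthread_mutex_lock(&b->lock);
	if (e->next)
		e->next->pprev = e->pprev;
	*e->pprev = e->next;
	pthread_mutex_unlock(&b->lock);
}
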
@@ -203,9 +298,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
 	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
-	/* To make sure we don't get any weird locking issues here:
-	 * destroy_conntrack() MUST NOT be called with a write lock
-	 * to nf_conntrack_lock!!! -HW */
 	rcu_read_lock();
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto && l4proto->destroy)
@@ -213,19 +305,18 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 	rcu_read_unlock();
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	/* Expectations will have been removed in clean_from_lists,
 	 * except TFTP can create an expectation on the first packet,
 	 * before connection is in the list, so we need to clean here,
-	 * too. */
+	 * too.
+	 */
 	nf_ct_remove_expectations(ct);
 
-	/* We overload first tuple to link into unconfirmed or dying list.*/
-	BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
-	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
 	NF_CT_STAT_INC(net, delete);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	if (ct->master)
 		nf_ct_put(ct->master);
@@ -237,17 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
+	unsigned int hash, reply_hash;
+	u16 zone = nf_ct_zone(ct);
+	unsigned int sequence;
 
 	nf_ct_helper_destroy(ct);
-	spin_lock_bh(&nf_conntrack_lock);
-	/* Inside lock so preempt is disabled on module removal path.
-	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(net, delete_list);
+
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
 	clean_from_lists(ct);
-	/* add this conntrack to the dying list */
-	hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-			     &net->ct.dying);
-	spin_unlock_bh(&nf_conntrack_lock);
+	nf_conntrack_double_unlock(hash, reply_hash);
+
+	nf_ct_add_to_dying_list(ct);
+
+	NF_CT_STAT_INC(net, delete_list);
+	local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -331,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -408,32 +508,36 @@ EXPORT_SYMBOL_GPL(nf_conntrack_find_get);
 
 static void __nf_conntrack_hash_insert(struct nf_conn *ct,
 				       unsigned int hash,
-				       unsigned int repl_hash)
+				       unsigned int reply_hash)
 {
 	struct net *net = nf_ct_net(ct);
 
 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
 				 &net->ct.hash[hash]);
 	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode,
-				 &net->ct.hash[repl_hash]);
+				 &net->ct.hash[reply_hash]);
 }
 
 int
 nf_conntrack_hash_check_insert(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
-	unsigned int hash, repl_hash;
+	unsigned int hash, reply_hash;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	u16 zone;
+	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone,
-			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	repl_hash = hash_conntrack(net, zone,
-				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -441,7 +545,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 				      &h->tuple) &&
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
@@ -451,15 +555,16 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	smp_wmb();
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
-	__nf_conntrack_hash_insert(ct, hash, repl_hash);
+	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
-
+	local_bh_enable();
 	return 0;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -467,15 +572,22 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
 /* deletion from this larval template list happens via nf_ct_put() */
 void nf_conntrack_tmpl_insert(struct net *net, struct nf_conn *tmpl)
 {
+	struct ct_pcpu *pcpu;
+
 	__set_bit(IPS_TEMPLATE_BIT, &tmpl->status);
 	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
 	nf_conntrack_get(&tmpl->ct_general);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	/* add this conntrack to the (per cpu) tmpl list */
+	local_bh_disable();
+	tmpl->cpu = smp_processor_id();
+	pcpu = per_cpu_ptr(nf_ct_net(tmpl)->ct.pcpu_lists, tmpl->cpu);
+
+	spin_lock(&pcpu->lock);
 	/* Overload tuple linked list to put us in template list. */
 	hlist_nulls_add_head_rcu(&tmpl->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-				 &net->ct.tmpl);
-	spin_unlock_bh(&nf_conntrack_lock);
+				 &pcpu->tmpl);
+	spin_unlock_bh(&pcpu->lock);
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
 
@@ -483,7 +595,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tmpl_insert);
 int
 __nf_conntrack_confirm(struct sk_buff *skb)
 {
-	unsigned int hash, repl_hash;
+	unsigned int hash, reply_hash;
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct nf_conn_help *help;
@@ -492,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
 	u16 zone;
+	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	net = nf_ct_net(ct);
@@ -504,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	/* reuse the hash saved before */
-	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-	hash = hash_bucket(hash, net);
-	repl_hash = hash_conntrack(net, zone,
-				   &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	local_bh_disable();
+
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		/* reuse the hash saved before */
+		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+		hash = hash_bucket(hash, net);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* We're not in hash table, and we refuse to set up related
-	   connections for unconfirmed conns. But packet copies and
-	   REJECT will give spurious warnings here. */
+	 * connections for unconfirmed conns. But packet copies and
+	 * REJECT will give spurious warnings here.
+	 */
 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
 	/* No external references means no one else could have
-	   confirmed us. */
+	 * confirmed us.
+	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-
-	spin_lock_bh(&nf_conntrack_lock);
-
 	/* We have to check the DYING flag inside the lock to prevent
 	   a race against nf_ct_get_next_corpse() possibly called from
 	   user context, else we insert an already 'dead' hash, blocking
 	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		nf_conntrack_double_unlock(hash, reply_hash);
+		local_bh_enable();
 		return NF_ACCEPT;
 	}
 
@@ -540,14 +659,13 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 				      &h->tuple) &&
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
-	hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode)
+	hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode)
 		if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
 				      &h->tuple) &&
 		    zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)))
 			goto out;
 
-	/* Remove from unconfirmed list */
-	hlist_nulls_del_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+	nf_ct_del_from_dying_or_unconfirmed_list(ct);
 
 	/* Timer relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
@@ -570,9 +688,10 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * guarantee that no other CPU can find the conntrack before the above
 	 * stores are visible.
 	 */
-	__nf_conntrack_hash_insert(ct, hash, repl_hash);
+	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -583,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -627,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection. Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct = NULL, *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, cnt = 0;
+	unsigned int i = 0, cnt = 0;
 	int dropped = 0;
+	unsigned int hash, sequence;
+	spinlock_t *lockp;
 
-	rcu_read_lock();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	local_bh_disable();
+restart:
+	sequence = read_seqcount_begin(&net->ct.generation);
+	hash = hash_bucket(_hash, net);
+	for (; i < net->ct.htable_size; i++) {
+		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (read_seqcount_retry(&net->ct.generation, sequence)) {
+			spin_unlock(lockp);
+			goto restart;
+		}
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
-			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+			    !nf_ct_is_dying(tmp) &&
+			    atomic_inc_not_zero(&tmp->ct_general.use)) {
 				ct = tmp;
+				break;
+			}
 			cnt++;
 		}
 
-		if (ct != NULL) {
-			if (likely(!nf_ct_is_dying(ct) &&
-				   atomic_inc_not_zero(&ct->ct_general.use)))
-				break;
-			else
-				ct = NULL;
-		}
+		hash = (hash + 1) % net->ct.htable_size;
+		spin_unlock(lockp);
 
-		if (cnt >= NF_CT_EVICTION_RANGE)
+		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
 
-		hash = (hash + 1) % net->ct.htable_size;
 	}
-	rcu_read_unlock();
+	local_bh_enable();
 
 	if (!ct)
 		return dropped;
@@ -708,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		if (!early_drop(net, hash_bucket(hash, net))) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 			return ERR_PTR(-ENOMEM);
@@ -805,7 +934,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 	struct nf_conn_help *help;
 	struct nf_conntrack_tuple repl_tuple;
 	struct nf_conntrack_ecache *ecache;
-	struct nf_conntrack_expect *exp;
+	struct nf_conntrack_expect *exp = NULL;
 	u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
 	struct nf_conn_timeout *timeout_ext;
 	unsigned int *timeouts;
@@ -849,42 +978,44 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
 				 ecache ? ecache->expmask : 0,
 				 GFP_ATOMIC);
 
-	spin_lock_bh(&nf_conntrack_lock);
-	exp = nf_ct_find_expectation(net, zone, tuple);
-	if (exp) {
-		pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
-			 ct, exp);
-		/* Welcome, Mr. Bond. We've been expecting you... */
-		__set_bit(IPS_EXPECTED_BIT, &ct->status);
-		ct->master = exp->master;
-		if (exp->helper) {
-			help = nf_ct_helper_ext_add(ct, exp->helper,
-						    GFP_ATOMIC);
-			if (help)
-				rcu_assign_pointer(help->helper, exp->helper);
-		}
+	local_bh_disable();
+	if (net->ct.expect_count) {
+		spin_lock(&nf_conntrack_expect_lock);
+		exp = nf_ct_find_expectation(net, zone, tuple);
+		if (exp) {
+			pr_debug("conntrack: expectation arrives ct=%p exp=%p\n",
+				 ct, exp);
+			/* Welcome, Mr. Bond. We've been expecting you... */
+			__set_bit(IPS_EXPECTED_BIT, &ct->status);
+			/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
+			ct->master = exp->master;
+			if (exp->helper) {
+				help = nf_ct_helper_ext_add(ct, exp->helper,
+							    GFP_ATOMIC);
+				if (help)
+					rcu_assign_pointer(help->helper, exp->helper);
+			}
 
 #ifdef CONFIG_NF_CONNTRACK_MARK
-		ct->mark = exp->master->mark;
+			ct->mark = exp->master->mark;
 #endif
 #ifdef CONFIG_NF_CONNTRACK_SECMARK
-		ct->secmark = exp->master->secmark;
+			ct->secmark = exp->master->secmark;
 #endif
-		nf_conntrack_get(&ct->master->ct_general);
-		NF_CT_STAT_INC(net, expect_new);
-	} else {
+			NF_CT_STAT_INC(net, expect_new);
+		}
+		spin_unlock(&nf_conntrack_expect_lock);
+	}
+	if (!exp) {
 		__nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC);
 		NF_CT_STAT_INC(net, new);
 	}
 
 	/* Now it is inserted into the unconfirmed list, bump refcount */
 	nf_conntrack_get(&ct->ct_general);
+	nf_ct_add_to_unconfirmed_list(ct);
 
-	/* Overload tuple linked list to put us in unconfirmed list. */
-	hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
-				 &net->ct.unconfirmed);
-
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	if (exp) {
 		if (exp->expectfn)
@@ -1254,27 +1385,42 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
+	int cpu;
+	spinlock_t *lockp;
 
-	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-				continue;
+		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+		local_bh_disable();
+		spin_lock(lockp);
+		if (*bucket < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+					continue;
+				ct = nf_ct_tuplehash_to_ctrack(h);
+				if (iter(ct, data))
+					goto found;
+			}
+		}
+		spin_unlock(lockp);
+		local_bh_enable();
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+		spin_lock_bh(&pcpu->lock);
+		hlist_nulls_for_each_entry(h, n, &pcpu->unconfirmed, hnnode) {
 			ct = nf_ct_tuplehash_to_ctrack(h);
 			if (iter(ct, data))
-				goto found;
+				set_bit(IPS_DYING_BIT, &ct->status);
 		}
+		spin_unlock_bh(&pcpu->lock);
 	}
-	hlist_nulls_for_each_entry(h, n, &net->ct.unconfirmed, hnnode) {
-		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (iter(ct, data))
-			set_bit(IPS_DYING_BIT, &ct->status);
-	}
-	spin_unlock_bh(&nf_conntrack_lock);
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock(lockp);
+	local_bh_enable();
 	return ct;
 }
 
@@ -1323,14 +1469,19 @@ static void nf_ct_release_dying_list(struct net *net)
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
+	int cpu;
 
-	spin_lock_bh(&nf_conntrack_lock);
-	hlist_nulls_for_each_entry(h, n, &net->ct.dying, hnnode) {
-		ct = nf_ct_tuplehash_to_ctrack(h);
-		/* never fails to remove them, no listeners at this point */
-		nf_ct_kill(ct);
+	for_each_possible_cpu(cpu) {
+		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+		spin_lock_bh(&pcpu->lock);
+		hlist_nulls_for_each_entry(h, n, &pcpu->dying, hnnode) {
+			ct = nf_ct_tuplehash_to_ctrack(h);
+			/* never fails to remove them, no listeners at this point */
+			nf_ct_kill(ct);
+		}
+		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
 }
 
 static int untrack_refs(void)
@@ -1417,6 +1568,7 @@ i_see_dead_people:
 		kmem_cache_destroy(net->ct.nf_conntrack_cachep);
 		kfree(net->ct.slabname);
 		free_percpu(net->ct.stat);
+		free_percpu(net->ct.pcpu_lists);
 	}
 }
 
@@ -1469,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	local_bh_disable();
+	nf_conntrack_all_lock();
+	write_seqcount_begin(&init_net.ct.generation);
+
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
-	 * though since that required taking the lock.
+	 * though since that required taking the locks.
 	 */
-	spin_lock_bh(&nf_conntrack_lock);
+
 	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1491,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 
 	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash = hash;
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	write_seqcount_end(&init_net.ct.generation);
+	nf_conntrack_all_unlock();
+	local_bh_enable();
 
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
@@ -1513,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int ret, cpu;
+	int i, ret, cpu;
+
+	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
+		spin_lock_init(&nf_conntrack_locks[i]);
 
 	/* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
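
The resize hunks above are the writer side of the same generation scheme: every bucket lock is taken in a fixed order, the hash table is replaced under write_seqcount protection, and readers that sampled the old generation recompute their hashes. Continuing the earlier hypothetical sketch (it reuses the bucket_locks array and table_generation counter declared there, and is again illustrative only, not part of the patch):

/* Illustrative sketch, continuing the earlier example. */
static void lock_all_and_swap_table(void)
{
	unsigned int i;

	/* Taking every bucket lock in index order excludes all readers. */
	for (i = 0; i < BUCKET_LOCKS; i++)
		pthread_mutex_lock(&bucket_locks[i]);

	/* ... swap in the new table and rehash the entries here ... */
	atomic_fetch_add(&table_generation, 1);	/* make cached hashes stale */

	for (i = 0; i < BUCKET_LOCKS; i++)
		pthread_mutex_unlock(&bucket_locks[i]);
}
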
@@ -1629,37 +1791,43 @@ void nf_conntrack_init_end(void)
 
 int nf_conntrack_init_net(struct net *net)
 {
-	int ret;
+	int ret = -ENOMEM;
+	int cpu;
 
 	atomic_set(&net->ct.count, 0);
-	INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, UNCONFIRMED_NULLS_VAL);
-	INIT_HLIST_NULLS_HEAD(&net->ct.dying, DYING_NULLS_VAL);
-	INIT_HLIST_NULLS_HEAD(&net->ct.tmpl, TEMPLATE_NULLS_VAL);
-	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
-	if (!net->ct.stat) {
-		ret = -ENOMEM;
+
+	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
+	if (!net->ct.pcpu_lists)
 		goto err_stat;
+
+	for_each_possible_cpu(cpu) {
+		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
+
+		spin_lock_init(&pcpu->lock);
+		INIT_HLIST_NULLS_HEAD(&pcpu->unconfirmed, UNCONFIRMED_NULLS_VAL);
+		INIT_HLIST_NULLS_HEAD(&pcpu->dying, DYING_NULLS_VAL);
+		INIT_HLIST_NULLS_HEAD(&pcpu->tmpl, TEMPLATE_NULLS_VAL);
 	}
 
+	net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
+	if (!net->ct.stat)
+		goto err_pcpu_lists;
+
 	net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net);
-	if (!net->ct.slabname) {
-		ret = -ENOMEM;
+	if (!net->ct.slabname)
 		goto err_slabname;
-	}
 
 	net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname,
 							sizeof(struct nf_conn), 0,
 							SLAB_DESTROY_BY_RCU, NULL);
 	if (!net->ct.nf_conntrack_cachep) {
 		printk(KERN_ERR "Unable to create nf_conn slab cache\n");
-		ret = -ENOMEM;
 		goto err_cache;
 	}
 
 	net->ct.htable_size = nf_conntrack_htable_size;
 	net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
 	if (!net->ct.hash) {
-		ret = -ENOMEM;
 		printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
 		goto err_hash;
 	}
@@ -1701,6 +1869,8 @@ err_cache:
 	kfree(net->ct.slabname);
 err_slabname:
 	free_percpu(net->ct.stat);
+err_pcpu_lists:
+	free_percpu(net->ct.pcpu_lists);
 err_stat:
 	return ret;
 }