@@ -72,12 +72,24 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
+struct conntrack_gc_work {
+	struct delayed_work	dwork;
+	u32			last_bucket;
+	bool			exiting;
+};
+
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
+#define GC_MAX_BUCKETS_DIV	64u
+#define GC_MAX_BUCKETS		8192u
+#define GC_INTERVAL		(5 * HZ)
+#define GC_MAX_EVICTS		256u
+
+static struct conntrack_gc_work	conntrack_gc_work;
+
 void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
 {
 	spin_lock(lock);
@@ -164,7 +176,7 @@ unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
 unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
+seqcount_t nf_conntrack_generation __read_mostly;
 
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
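The seqcount loses its static qualifier here (and nf_conntrack_get_ht() disappears from this file further down) so that the hash-pointer/size snapshot can be taken from a header helper. A minimal sketch of what that inline presumably looks like in include/net/netfilter/nf_conntrack.h, reusing the body removed below; the placement is an assumption, not part of this hunk:

	/* sketch: caller must hold rcu_read_lock() */
	static inline void
	nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
	{
		struct hlist_nulls_head *hptr;
		unsigned int sequence, hsz;

		do {
			sequence = read_seqcount_begin(&nf_conntrack_generation);
			hsz = nf_conntrack_htable_size;
			hptr = nf_conntrack_hash;
		} while (read_seqcount_retry(&nf_conntrack_generation, sequence));

		*hash = hptr;
		*hsize = hsz;
	}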
@@ -372,7 +384,6 @@ destroy_conntrack(struct nf_conntrack *nfct)
 
 	pr_debug("destroy_conntrack(%p)\n", ct);
 	NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
-	NF_CT_ASSERT(!timer_pending(&ct->timeout));
 
 	if (unlikely(nf_ct_is_template(ct))) {
 		nf_ct_tmpl_free(ct);
@@ -435,35 +446,30 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report)
 {
 	struct nf_conn_tstamp *tstamp;
 
+	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
+		return false;
+
 	tstamp = nf_conn_tstamp_find(ct);
 	if (tstamp && tstamp->stop == 0)
 		tstamp->stop = ktime_get_real_ns();
 
-	if (nf_ct_is_dying(ct))
-		goto delete;
-
 	if (nf_conntrack_event_report(IPCT_DESTROY, ct,
 				    portid, report) < 0) {
-		/* destroy event was not delivered */
+		/* destroy event was not delivered. nf_ct_put will
+		 * be done by event cache worker on redelivery.
+		 */
 		nf_ct_delete_from_lists(ct);
 		nf_conntrack_ecache_delayed_work(nf_ct_net(ct));
 		return false;
 	}
 
 	nf_conntrack_ecache_work(nf_ct_net(ct));
-	set_bit(IPS_DYING_BIT, &ct->status);
- delete:
 	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
 	return true;
 }
 EXPORT_SYMBOL_GPL(nf_ct_delete);
 
-static void death_by_timeout(unsigned long ul_conntrack)
-{
-	nf_ct_delete((struct nf_conn *)ul_conntrack, 0, 0);
-}
-
 static inline bool
 nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 		const struct nf_conntrack_tuple *tuple,
@@ -481,22 +487,17 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
 	       net_eq(net, nf_ct_net(ct));
 }
 
-/* must be called with rcu read lock held */
-void nf_conntrack_get_ht(struct hlist_nulls_head **hash, unsigned int *hsize)
+/* caller must hold rcu readlock and none of the nf_conntrack_locks */
+static void nf_ct_gc_expired(struct nf_conn *ct)
 {
-	struct hlist_nulls_head *hptr;
-	unsigned int sequence, hsz;
+	if (!atomic_inc_not_zero(&ct->ct_general.use))
+		return;
 
-	do {
-		sequence = read_seqcount_begin(&nf_conntrack_generation);
-		hsz = nf_conntrack_htable_size;
-		hptr = nf_conntrack_hash;
-	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+	if (nf_ct_should_gc(ct))
+		nf_ct_kill(ct);
 
-	*hash = hptr;
-	*hsize = hsz;
+	nf_ct_put(ct);
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_get_ht);
 
 /*
  * Warning :
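nf_ct_gc_expired() relies on helpers that are not part of this file; ct->timeout is now a plain u32 holding an absolute jiffies deadline. A hedged sketch of the assumed header-side definitions (names and exact form are assumptions taken from the rest of this series, not from this hunk):

	#define nfct_time_stamp ((u32)(jiffies))

	/* assumed: true once the absolute deadline has passed */
	static inline bool nf_ct_is_expired(const struct nf_conn *ct)
	{
		return (__s32)(ct->timeout - nfct_time_stamp) <= 0;
	}

	/* assumed: only gc entries that are confirmed, expired and not
	 * already being torn down; call with a reference held.
	 */
	static inline bool nf_ct_should_gc(const struct nf_conn *ct)
	{
		return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) &&
		       !nf_ct_is_dying(ct);
	}

The signed subtraction keeps the comparison correct across u32 wraparound, the same trick time_after() uses for jiffies.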
@@ -510,16 +511,24 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone,
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_head *ct_hash;
 	struct hlist_nulls_node *n;
-	unsigned int bucket, sequence;
+	unsigned int bucket, hsize;
 
 begin:
-	do {
-		sequence = read_seqcount_begin(&nf_conntrack_generation);
-		bucket = scale_hash(hash);
-		ct_hash = nf_conntrack_hash;
-	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+	nf_conntrack_get_ht(&ct_hash, &hsize);
+	bucket = reciprocal_scale(hash, hsize);
 
 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) {
+		struct nf_conn *ct;
+
+		ct = nf_ct_tuplehash_to_ctrack(h);
+		if (nf_ct_is_expired(ct)) {
+			nf_ct_gc_expired(ct);
+			continue;
+		}
+
+		if (nf_ct_is_dying(ct))
+			continue;
+
 		if (nf_ct_key_equal(h, tuple, zone, net)) {
 			NF_CT_STAT_INC_ATOMIC(net, found);
 			return h;
@@ -618,7 +627,6 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 				    zone, net))
 			goto out;
 
-	add_timer(&ct->timeout);
 	smp_wmb();
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
@@ -771,8 +779,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	/* Timer relative to confirmation time, not original
 	   setting time, otherwise we'd get timer wrap in
 	   weird delay cases. */
-	ct->timeout.expires += jiffies;
-	add_timer(&ct->timeout);
+	ct->timeout += nfct_time_stamp;
 	atomic_inc(&ct->ct_general.use);
 	ct->status |= IPS_CONFIRMED;
 
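Since ct->timeout now stores an absolute u32 deadline instead of driving a struct timer_list, code that used to inspect timer state derives the remaining lifetime by subtraction. A minimal sketch, assuming the nfct_time_stamp scheme sketched earlier (not part of this hunk):

	/* jiffies until the entry expires, 0 if it already has */
	static inline unsigned long nf_ct_expires(const struct nf_conn *ct)
	{
		s32 timeout = ct->timeout - nfct_time_stamp;

		return timeout > 0 ? timeout : 0;
	}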
@@ -823,29 +830,41 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
 	const struct nf_conntrack_zone *zone;
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_head *ct_hash;
-	unsigned int hash, sequence;
+	unsigned int hash, hsize;
 	struct hlist_nulls_node *n;
 	struct nf_conn *ct;
 
 	zone = nf_ct_zone(ignored_conntrack);
 
 	rcu_read_lock();
-	do {
-		sequence = read_seqcount_begin(&nf_conntrack_generation);
-		hash = hash_conntrack(net, tuple);
-		ct_hash = nf_conntrack_hash;
-	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+ begin:
+	nf_conntrack_get_ht(&ct_hash, &hsize);
+	hash = __hash_conntrack(net, tuple, hsize);
 
 	hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) {
 		ct = nf_ct_tuplehash_to_ctrack(h);
-		if (ct != ignored_conntrack &&
-		    nf_ct_key_equal(h, tuple, zone, net)) {
+
+		if (ct == ignored_conntrack)
+			continue;
+
+		if (nf_ct_is_expired(ct)) {
+			nf_ct_gc_expired(ct);
+			continue;
+		}
+
+		if (nf_ct_key_equal(h, tuple, zone, net)) {
 			NF_CT_STAT_INC_ATOMIC(net, found);
 			rcu_read_unlock();
 			return 1;
 		}
 		NF_CT_STAT_INC_ATOMIC(net, searched);
 	}
+
+	if (get_nulls_value(n) != hash) {
+		NF_CT_STAT_INC_ATOMIC(net, search_restart);
+		goto begin;
+	}
+
 	rcu_read_unlock();
 
 	return 0;
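The get_nulls_value() test works because every conntrack chain is terminated by a nulls marker that encodes the bucket it belongs to; the table is initialized roughly like this (as in nf_ct_alloc_hashtable, shown here only for context):

	for (i = 0; i < nr_slots; i++)
		INIT_HLIST_NULLS_HEAD(&hash[i], i);

so finishing the lockless walk on a marker whose value differs from the bucket we started in means the entry was moved to another chain (e.g. during a resize) and the scan must be restarted, as ____nf_conntrack_find() already does.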
@@ -867,6 +886,11 @@ static unsigned int early_drop_list(struct net *net,
 	hlist_nulls_for_each_entry_rcu(h, n, head, hnnode) {
 		tmp = nf_ct_tuplehash_to_ctrack(h);
 
+		if (nf_ct_is_expired(tmp)) {
+			nf_ct_gc_expired(tmp);
+			continue;
+		}
+
 		if (test_bit(IPS_ASSURED_BIT, &tmp->status) ||
 		    !net_eq(nf_ct_net(tmp), net) ||
 		    nf_ct_is_dying(tmp))
@@ -884,7 +908,6 @@ static unsigned int early_drop_list(struct net *net,
 		 */
 		if (net_eq(nf_ct_net(tmp), net) &&
 		    nf_ct_is_confirmed(tmp) &&
-		    del_timer(&tmp->timeout) &&
 		    nf_ct_delete(tmp, 0, 0))
 			drops++;
 
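Dropping the del_timer() test removes the old guarantee that only one CPU tears down a given entry; that exclusivity now comes from nf_ct_delete() itself, via the check added earlier in this patch:

	if (test_and_set_bit(IPS_DYING_BIT, &ct->status))
		return false;

so concurrent early-drop, gc-worker and packet-path deletions can safely race on the same conntrack.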
@@ -900,14 +923,11 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 
 	for (i = 0; i < NF_CT_EVICTION_RANGE; i++) {
 		struct hlist_nulls_head *ct_hash;
-		unsigned hash, sequence, drops;
+		unsigned int hash, hsize, drops;
 
 		rcu_read_lock();
-		do {
-			sequence = read_seqcount_begin(&nf_conntrack_generation);
-			hash = scale_hash(_hash++);
-			ct_hash = nf_conntrack_hash;
-		} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+		nf_conntrack_get_ht(&ct_hash, &hsize);
+		hash = reciprocal_scale(_hash++, hsize);
 
 		drops = early_drop_list(net, &ct_hash[hash]);
 		rcu_read_unlock();
@@ -921,6 +941,69 @@ static noinline int early_drop(struct net *net, unsigned int _hash)
 	return false;
 }
 
+static void gc_worker(struct work_struct *work)
+{
+	unsigned int i, goal, buckets = 0, expired_count = 0;
+	unsigned long next_run = GC_INTERVAL;
+	unsigned int ratio, scanned = 0;
+	struct conntrack_gc_work *gc_work;
+
+	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
+
+	goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS);
+	i = gc_work->last_bucket;
+
+	do {
+		struct nf_conntrack_tuple_hash *h;
+		struct hlist_nulls_head *ct_hash;
+		struct hlist_nulls_node *n;
+		unsigned int hashsz;
+		struct nf_conn *tmp;
+
+		i++;
+		rcu_read_lock();
+
+		nf_conntrack_get_ht(&ct_hash, &hashsz);
+		if (i >= hashsz)
+			i = 0;
+
+		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			tmp = nf_ct_tuplehash_to_ctrack(h);
+
+			scanned++;
+			if (nf_ct_is_expired(tmp)) {
+				nf_ct_gc_expired(tmp);
+				expired_count++;
+				continue;
+			}
+		}
+
+		/* could check get_nulls_value() here and restart if ct
+		 * was moved to another chain. But given gc is best-effort
+		 * we will just continue with next hash slot.
+		 */
+		rcu_read_unlock();
+		cond_resched_rcu_qs();
+	} while (++buckets < goal &&
+		 expired_count < GC_MAX_EVICTS);
+
+	if (gc_work->exiting)
+		return;
+
+	ratio = scanned ? expired_count * 100 / scanned : 0;
+	if (ratio >= 90)
+		next_run = 0;
+
+	gc_work->last_bucket = i;
+	schedule_delayed_work(&gc_work->dwork, next_run);
+}
+
+static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
+{
+	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
+	gc_work->exiting = false;
+}
+
 static struct nf_conn *
 __nf_conntrack_alloc(struct net *net,
 		     const struct nf_conntrack_zone *zone,
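The GC_* constants bound the work done per worker invocation. For example, with an nf_conntrack_htable_size of 65536 buckets (the default depends on available memory):

	goal = min(65536 / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS)
	     = min(1024, 8192)
	     = 1024 buckets scanned per run

A run also ends early once GC_MAX_EVICTS (256) expired entries have been reclaimed, and if 90% or more of the scanned entries turned out to be expired, next_run drops from GC_INTERVAL (5 * HZ) to 0 so the worker reschedules immediately instead of waiting another five seconds.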
@@ -957,8 +1040,6 @@ __nf_conntrack_alloc(struct net *net,
 	/* save hash for reusing when confirming */
 	*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
 	ct->status = 0;
-	/* Don't set timer yet: wait for confirmation */
-	setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct);
 	write_pnet(&ct->ct_net, net);
 	memset(&ct->__nfct_init_offset[0], 0,
 	       offsetof(struct nf_conn, proto) -
@@ -1332,7 +1413,6 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 			  unsigned long extra_jiffies,
 			  int do_acct)
 {
-	NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
 	NF_CT_ASSERT(skb);
 
 	/* Only update if this is not a fixed timeout */
@@ -1340,39 +1420,25 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
 		goto acct;
 
 	/* If not in hash table, timer will not be active yet */
-	if (!nf_ct_is_confirmed(ct)) {
-		ct->timeout.expires = extra_jiffies;
-	} else {
-		unsigned long newtime = jiffies + extra_jiffies;
-
-		/* Only update the timeout if the new timeout is at least
-		   HZ jiffies from the old timeout. Need del_timer for race
-		   avoidance (may already be dying). */
-		if (newtime - ct->timeout.expires >= HZ)
-			mod_timer_pending(&ct->timeout, newtime);
-	}
+	if (nf_ct_is_confirmed(ct))
+		extra_jiffies += nfct_time_stamp;
 
+	ct->timeout = extra_jiffies;
 acct:
 	if (do_acct)
 		nf_ct_acct_update(ct, ctinfo, skb->len);
 }
 EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
 
-bool __nf_ct_kill_acct(struct nf_conn *ct,
-		       enum ip_conntrack_info ctinfo,
-		       const struct sk_buff *skb,
-		       int do_acct)
+bool nf_ct_kill_acct(struct nf_conn *ct,
+		     enum ip_conntrack_info ctinfo,
+		     const struct sk_buff *skb)
 {
-	if (do_acct)
-		nf_ct_acct_update(ct, ctinfo, skb->len);
+	nf_ct_acct_update(ct, ctinfo, skb->len);
 
-	if (del_timer(&ct->timeout)) {
-		ct->timeout.function((unsigned long)ct);
-		return true;
-	}
-	return false;
+	return nf_ct_delete(ct, 0, 0);
 }
-EXPORT_SYMBOL_GPL(__nf_ct_kill_acct);
+EXPORT_SYMBOL_GPL(nf_ct_kill_acct);
 
 #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
 
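With the do_acct variant gone, killing without accounting is just nf_ct_delete(), so the nf_ct_kill() wrapper presumably collapses to a one-liner in the header. A sketch of the assumed counterpart change (not part of this hunk):

	/* kill conntrack without accounting */
	static inline bool nf_ct_kill(struct nf_conn *ct)
	{
		return nf_ct_delete(ct, 0, 0);
	}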
@@ -1505,11 +1571,8 @@ void nf_ct_iterate_cleanup(struct net *net,
 
 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
-		if (del_timer(&ct->timeout))
-			nf_ct_delete(ct, portid, report);
-
-		/* ... else the timer will get him soon. */
 
+		nf_ct_delete(ct, portid, report);
 		nf_ct_put(ct);
 		cond_resched();
 	}
@@ -1545,6 +1608,7 @@ static int untrack_refs(void)
 
 void nf_conntrack_cleanup_start(void)
 {
+	conntrack_gc_work.exiting = true;
 	RCU_INIT_POINTER(ip_ct_attach, NULL);
 }
 
@@ -1554,6 +1618,7 @@ void nf_conntrack_cleanup_end(void)
 	while (untrack_refs() > 0)
 		schedule();
 
+	cancel_delayed_work_sync(&conntrack_gc_work.dwork);
 	nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
 
 	nf_conntrack_proto_fini();
@@ -1828,6 +1893,10 @@ int nf_conntrack_init_start(void)
 	}
 	/* - and look it like as a confirmed connection */
 	nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED);
+
+	conntrack_gc_work_init(&conntrack_gc_work);
+	schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL);
+
 	return 0;
 
 err_proto: