|
@@ -1222,6 +1222,45 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start,
|
|
|
return 1;
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Count the distinct byte values recorded in the workspace buckets, i.e. the
|
|
|
+ * number of buckets whose count is non-zero.
|
|
|
+ * This heuristic can detect textual data (configs, xml, json, html, etc.),
|
|
|
+ * because in most text-like data the byte set is restricted to a limited
|
|
|
+ * number of possible characters, and that restriction in most cases makes
|
|
|
+ * the data easy to compress.
|
|
|
+ *
|
|
|
+ * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
|
|
|
+ * less - compressible
|
|
|
+ * more - need additional analysis
|
|
|
+ *
|
|
|
+ * The scan stops as soon as the size exceeds BYTE_SET_THRESHOLD, so the
|
|
|
+ * returned size is capped at BYTE_SET_THRESHOLD + 1.
|
|
|
+ */
|
|
|
+#define BYTE_SET_THRESHOLD (64)
|
|
|
+
|
|
|
+static u32 byte_set_size(const struct heuristic_ws *ws)
|
|
|
+{
|
|
|
+ u32 i; /* bucket index, carried over from the first loop to the second */
|
|
|
+ u32 byte_set_size = 0; /* running count; shadows the function name here */
|
|
|
+
|
|
|
+ for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
|
|
|
+ if (ws->bucket[i].count > 0)
|
|
|
+ byte_set_size++;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Continue collecting count of byte values in buckets. The loop above
|
|
|
+ * visits only BYTE_SET_THRESHOLD buckets, so it cannot exceed the
|
|
|
+ * threshold and needs no check; from here on the check is required.
|
|
|
+ * If the byte set size is bigger than the threshold, it's pointless to
|
|
|
+ * continue, the detection technique would fail for this type of data.
|
|
|
+ */
|
|
|
+ for (; i < BUCKET_SIZE; i++) { /* BUCKET_SIZE is defined outside this hunk */
|
|
|
+ if (ws->bucket[i].count > 0) {
|
|
|
+ byte_set_size++;
|
|
|
+ if (byte_set_size > BYTE_SET_THRESHOLD)
|
|
|
+ return byte_set_size;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return byte_set_size;
|
|
|
+}
|
|
|
+
|
|
|
static bool sample_repeated_patterns(struct heuristic_ws *ws)
|
|
|
{
|
|
|
const u32 half_of_sample = ws->sample_size / 2;
|
|
@@ -1321,6 +1360,12 @@ int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
|
|
|
ws->bucket[byte].count++;
|
|
|
}
|
|
|
|
|
|
+ i = byte_set_size(ws);
|
|
|
+ if (i < BYTE_SET_THRESHOLD) {
|
|
|
+ ret = 2;
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
out:
|
|
|
__free_workspace(0, ws_list, true);
|
|
|
return ret;
|