压缩

列存压缩是CH 高性能的秘诀

一个示例, 感官认识

create database compresstiondb on cluster cluster;

不压缩

CREATE TABLE none_compression_shard ON CLUSTER cluster
(
    log_time DateTime CODEC(NONE),
    machine_name LowCardinality(String) CODEC(NONE),
    machine_group LowCardinality(String) CODEC(NONE),
    cpu_idle Nullable(Float32) CODEC(NONE),
    cpu_nice Nullable(Float32) CODEC(NONE),
    cpu_system Nullable(Float32) CODEC(NONE),
    cpu_user Nullable(Float32) CODEC(NONE),
    cpu_wio Nullable(Float32) CODEC(NONE),
    disk_free Nullable(Float32) CODEC(NONE),
    disk_total Nullable(Float32) CODEC(NONE),
    part_max_used Nullable(Float32) CODEC(NONE),
    load_fifteen Nullable(Float32) CODEC(NONE),
    load_five Nullable(Float32) CODEC(NONE),
    load_one Nullable(Float32) CODEC(NONE),
    mem_buffers Nullable(Float32) CODEC(NONE),
    mem_cached Nullable(Float32) CODEC(NONE),
    mem_free Nullable(Float32) CODEC(NONE),
    mem_shared Nullable(Float32) CODEC(NONE),
    swap_free Nullable(Float32) CODEC(NONE),
    bytes_in Nullable(Float32) CODEC(NONE),
    bytes_out Nullable(Float32) CODEC(NONE)
) ENGINE = ReplicatedMergeTree()
ORDER BY log_time;

CREATE TABLE none_compression_distributed ON CLUSTER cluster AS none_compression_shard
ENGINE = Distributed(cluster, 'compresstiondb', 'none_compression_shard', rand());

默认压缩格式

分片表（默认压缩格式）

CREATE TABLE default_compression_shard ON CLUSTER cluster
(
    log_time DateTime,
    machine_name LowCardinality(String),
    machine_group LowCardinality(String),
    cpu_idle Nullable(Float32),
    cpu_nice Nullable(Float32),
    cpu_system Nullable(Float32),
    cpu_user Nullable(Float32),
    cpu_wio Nullable(Float32),
    disk_free Nullable(Float32),
    disk_total Nullable(Float32),
    part_max_used Nullable(Float32),
    load_fifteen Nullable(Float32),
    load_five Nullable(Float32),
    load_one Nullable(Float32),
    mem_buffers Nullable(Float32),
    mem_cached Nullable(Float32),
    mem_free Nullable(Float32),
    mem_shared Nullable(Float32),
    swap_free Nullable(Float32),
    bytes_in Nullable(Float32),
    bytes_out Nullable(Float32)
) ENGINE = ReplicatedMergeTree()
ORDER BY log_time;

CREATE TABLE default_compression_distributed ON CLUSTER cluster AS default_compression_shard
ENGINE = Distributed(cluster, 'compresstiondb', 'default_compression_shard', rand());

ZSTD 压缩格式

CREATE TABLE zstd_compression_shard ON CLUSTER cluster
(
    log_time DateTime CODEC(ZSTD),
    machine_name LowCardinality(String) CODEC(ZSTD),
    machine_group LowCardinality(String) CODEC(ZSTD),
    cpu_idle Nullable(Float32) CODEC(ZSTD),
    cpu_nice Nullable(Float32) CODEC(ZSTD),
    cpu_system Nullable(Float32) CODEC(ZSTD),
    cpu_user Nullable(Float32) CODEC(ZSTD),
    cpu_wio Nullable(Float32) CODEC(ZSTD),
    disk_free Nullable(Float32) CODEC(ZSTD),
    disk_total Nullable(Float32) CODEC(ZSTD),
    part_max_used Nullable(Float32) CODEC(ZSTD),
    load_fifteen Nullable(Float32) CODEC(ZSTD),
    load_five Nullable(Float32) CODEC(ZSTD),
    load_one Nullable(Float32) CODEC(ZSTD),
    mem_buffers Nullable(Float32) CODEC(ZSTD),
    mem_cached Nullable(Float32) CODEC(ZSTD),
    mem_free Nullable(Float32) CODEC(ZSTD),
    mem_shared Nullable(Float32) CODEC(ZSTD),
    swap_free Nullable(Float32) CODEC(ZSTD),
    bytes_in Nullable(Float32) CODEC(ZSTD),
    bytes_out Nullable(Float32) CODEC(ZSTD)
) ENGINE = ReplicatedMergeTree()
ORDER BY log_time;

CREATE TABLE zstd_compression_distributed ON CLUSTER cluster AS zstd_compression_shard
ENGINE = Distributed(cluster, 'compresstiondb', 'zstd_compression_shard', rand());

ZL4 压缩格式

CREATE TABLE lz4_compression_shard ON CLUSTER cluster
(
    log_time DateTime,
    machine_name LowCardinality(String),
    machine_group LowCardinality(String),
    cpu_idle Nullable(Float32),
    cpu_nice Nullable(Float32),
    cpu_system Nullable(Float32),
    cpu_user Nullable(Float32),
    cpu_wio Nullable(Float32),
    disk_free Nullable(Float32),
    disk_total Nullable(Float32),
    part_max_used Nullable(Float32),
    load_fifteen Nullable(Float32),
    load_five Nullable(Float32),
    load_one Nullable(Float32),
    mem_buffers Nullable(Float32),
    mem_cached Nullable(Float32),
    mem_free Nullable(Float32),
    mem_shared Nullable(Float32),
    swap_free Nullable(Float32),
    bytes_in Nullable(Float32),
    bytes_out Nullable(Float32)
) ENGINE = ReplicatedMergeTree()
ORDER BY log_time;

CREATE TABLE lz4_compression_distributed ON CLUSTER cluster AS lz4_compression_shard
ENGINE = Distributed(cluster, 'compresstiondb', 'lz4_compression_shard', rand());

插入样例数据

INSERT INTO none_compression_distributed
SELECT now(), 
       'machine_' || toString(number % 10), 
       'group_' || toString(number % 5),
       rand() % 100, rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 1000, rand() % 1000, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100
FROM numbers(10000000);

INSERT INTO default_compression_distributed
SELECT now(), 
       'machine_' || toString(number % 10), 
       'group_' || toString(number % 5),
       rand() % 100, rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 1000, rand() % 1000, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100
FROM numbers(10000000);


INSERT INTO lz4_compression_distributed
SELECT now(), 
       'machine_' || toString(number % 10), 
       'group_' || toString(number % 5),
       rand() % 100, rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 1000, rand() % 1000, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100
FROM numbers(10000000);

INSERT INTO zstd_compression_distributed
SELECT now(), 
       'machine_' || toString(number % 10), 
       'group_' || toString(number % 5),
       rand() % 100, rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 1000, rand() % 1000, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100, rand() % 100, rand() % 100, rand() % 100,
       rand() % 100
FROM numbers(10000000);

查看结果集

SELECT
    `table`,
    formatReadableSize(sum(bytes_on_disk)) AS total_size
FROM system.parts
WHERE (database = 'compresstiondb') AND (`table` LIKE '%compression%')
GROUP BY `table`
ORDER BY total_size DESC

Query id: 8f315da9-83f5-4fce-a201-9e653341ce48

   ┌─table─────────────────────┬─total_size─┐
1. │ none_compression_shard    │ 745.91 MiB │
2. │ default_compression_shard │ 417.31 MiB │
3. │ lz4_compression_shard     │ 417.24 MiB │
4. │ zstd_compression_shard    │ 197.74 MiB │
   └───────────────────────────┴────────────┘

压缩原理

影响压缩的因素

ordering key
data types
codecs

常见编码

Delta

特点: 适用于整数和日期时间类型的数据，尤其是连续值之间差异较小的情况。应用场景: 单调递增或递减的序列，如时间戳、ID。

DoubleDelta

特点: 在 Delta 编码基础上进一步压缩，适用于连续的差异本身较小的数据。

应用场景: 二阶差分数据，如股票价格变化。

Gorilla

特点: 适用于浮点数数据，特别是具有随机峰值的度量数据。

应用场景: 传感器数据、指标读数。

64

特点: 适用于稀疏数据或小范围数据。

应用场景: 稀疏矩阵、分类数据。

压缩算法

value   name    description
0x02    None    No compression, only checksums
0x82    LZ4 Extremely fast, good compression
0x90    ZSTD    Zstandard, pretty fast, best compression