Skip to content

Clickhouse

Install

mkdir -p /docker/clickhouse/config
mkdir -p /docker/clickhouse/data
touch /docker/clickhouse/config/config.xml 
touch /docker/clickhouse/config/users.xml

docker run -d --name clickhouse-server \
    --ulimit nofile=262144:262144 \
    -p 8123:8123 \
    -p 9000:9000 \
    -v /docker/clickhouse/data:/var/lib/clickhouse \
    --restart unless-stopped \
    clickhouse/clickhouse-server


docker exec -it clickhouse-server clickhouse-client --query "SELECT name, value FROM system.settings WHERE name IN ('max_memory_usage', 'max_concurrent_queries')"


docker exec -it clickhouse-server clickhouse-client

> create database qbdata_db;
> CREATE USER qbdata_db IDENTIFIED WITH plaintext_password BY '***';
> GRANT ALL ON qbdata_db.* TO qbdata_db;

# verify
SHOW DATABASES; 
SHOW USERS;
# example
-- Create a database
CREATE DATABASE my_database;
-- Create a user
CREATE USER my_user IDENTIFIED WITH plaintext_password BY 'my_password';
-- Grant privileges
GRANT ALL ON my_database.* TO my_user;

> EXIT;

Commands

Functions

  • has
    -- or/hasany
    SELECT count(1) from middle_source_plus msp where hasAny(msp.array_brand, ['宝马', '比亚迪']); 
    -- and /hasall
    SELECT count(1) from middle_source_plus msp where hasall(msp.array_brand, ['宝马', '比亚迪']); 
    -- equeal/ has
    SELECT count(1) from middle_source_plus msp where has(msp.array_brand, '比亚迪'); 
    

Performance

  • Limit 小批次更新

  • 投影

    ALTER TABLE my_table
    ADD PROJECTION my_projection
    (
        SELECT column1, column2, sum(column3)
        GROUP BY column1, column2
    );
    

  • 查看merge操作

    SELECT * FROM system.merges;
    

  • 限制内存

    SELECT name, value
    FROM system.settings
    WHERE name LIKE '%max_memory%';
    
    SET max_memory_usage = 10000000000; -- 例如设置更高的限制,10 GB
    

  • 内存溢出

    -- 查看参数
    SELECT name, value
    FROM system.settings
    WHERE name LIKE '%max_bytes_before_external%';
    -- 在内存不足时将 溢出内存数据写入磁盘;
    SET max_bytes_before_external_sort = 1000000000; -- 1 GB
    SET max_bytes_before_external_group_by = 1000000000; -- 1 GB
    

投影

-- 在现有表上添加投影
ALTER TABLE table_name
    ADD PROJECTION projection_name
    (
        SELECT col1, col2, sum(col3) as sum_col3
        GROUP BY col1, col2
    );

-- 在创建表时定义投影
CREATE TABLE table_name
(
    col1 String,
    col2 String,
    col3 Int32
)
ENGINE = MergeTree()
ORDER BY (col1)
SETTINGS index_granularity = 8192
PROJECTIONS
(
    proj1
    (
        SELECT col1, col2, sum(col3)
        GROUP BY col1, col2
    )
);

物化视图(MATERIALIZED VIEW)

  • create data table
CREATE TABLE qbdata_db.test_sales
(
    `sale_year` String COMMENT '年份',
    `sale_month` String COMMENT '月份',
    `sale_group_id` String COMMENT '集团ID',
    `sale_country` String COMMENT '销售国家',
    `sale_quantity` UInt32 COMMENT '销量'

)
ENGINE = MergeTree()
ORDER BY sale_group_id
SETTINGS index_granularity = 8192;
  • Create Materialized View(Below...)

  • Insert And Select

INSERT INTO test_sales (sale_year, sale_month, sale_group_id, sale_country, sale_quantity)
VALUES (2024, 8, 1, 'China', 100);

INSERT INTO test_sales (sale_year, sale_month, sale_group_id, sale_country, sale_quantity)
VALUES (2024, 8, 1, 'China', 50);

INSERT INTO test_sales (sale_year, sale_month, sale_group_id, sale_country, sale_quantity)
VALUES (2024, 8, 2, 'China', 50);

INSERT INTO test_sales (sale_year, sale_month, sale_group_id, sale_country, sale_quantity)
VALUES (2024, 8, 2, 'China', 20);

MATERIALIZED VIEW(SummingMergeTree)

  • Create View(SummingMergeTree)

    会对order by的相同主键行的 group_quantity 自动求和。

CREATE MATERIALIZED VIEW test_view_sales_ymgc
ENGINE = SummingMergeTree()
ORDER BY (sale_year, sale_month, sale_group_id, sale_country)
POPULATE AS
SELECT
    toInt32(sale_year) AS sale_year,
    toInt32(sale_month) AS sale_month,
    sale_group_id,
    sale_country,
    sale_quantity AS group_quantity
FROM test_sales;
  • Select

SELECT * FROM test_view_sales_ymgc;
Alt text

-- with group by
SELECT
    sale_year,
    sale_month,
    sale_group_id,
    sale_country,
    SUM(group_quantity) AS total_quantity
FROM test_view_sales_ymgc
GROUP BY sale_year, sale_month, sale_group_id, sale_country;
Alt text

  • 强制合并后查询
    OPTIMIZE TABLE test_view_sales_ymgc FINAL;
    
    SELECT * FROM test_view_sales_ymgc;
    
    Alt text
Hint
  • SummingMergeTree 引擎的工作原理:

    • 该引擎会在后台合并过程中,对具有相同主键的行的数值列进行求和。
    • 数据的最终一致性是通过后台合并实现的,可能不会立即反映在查询结果中。
    • 避免在物化视图的 SELECT 中使用聚合函数和 GROUP BY

    • 让引擎自动处理聚合,否则可能导致数据重复累加或结果不正确。

    • 查询时的聚合:

    • 如果需要立即查看聚合后的结果,可以在查询时使用 GROUP BYSUM

    • 这样可以避免等待后台合并完成。

MATERIALIZED VIEW(AggregatingMergeTree)

  • Create View
CREATE MATERIALIZED VIEW test_view_sales_agg
ENGINE = AggregatingMergeTree()
ORDER BY (sale_year, sale_month, sale_group_id, sale_country)
AS
SELECT
    sale_year,
    sale_month,
    sale_group_id,
    sale_country,
    sumState(sale_quantity) AS sum_quantity_state,
    maxState(sale_quantity) AS max_quantity_state,
    minState(sale_quantity) AS min_quantity_state
FROM test_sales
GROUP BY sale_year, sale_month, sale_group_id, sale_country;
  • Insert History Data(Optional)
INSERT INTO test_view_sales_agg
SELECT
    sale_year,
    sale_month,
    sale_group_id,
    sale_country,
    sumState(sale_quantity) AS sum_quantity_state,
    maxState(sale_quantity) AS max_quantity_state,
    minState(sale_quantity) AS min_quantity_state
FROM test_sales
GROUP BY sale_year, sale_month, sale_group_id, sale_country;
  • Select

    由于 AggregatingMergeTree 存储的是聚合函数的状态,需要在查询时应用 ...Merge() 函数来获取最终结果。

SELECT
    sale_year,
    sale_month,
    sale_group_id,
    sale_country,
    sumMerge(sum_quantity_state) AS total_quantity,
    maxMerge(max_quantity_state) AS max_quantity,
    minMerge(min_quantity_state) AS min_quantity
FROM test_view_sales_agg
GROUP BY sale_year, sale_month, sale_group_id, sale_country;

Alt text

  • sumMerge()maxMerge()minMerge():这些函数用于从存储的聚合状态中计算最终结果。
  • GROUP BY:在查询中需要再次使用 GROUP BY,以确保正确合并聚合状态。
Hint
  • 聚合函数的状态存储:

    • 在使用 AggregatingMergeTree 时,聚合函数的状态(如 sumState)被存储为专门的数据类型 AggregateFunction
    • 在查询时,需要使用对应的 ...Merge 函数来计算最终结果。
    • 数据类型:

    • 确保数值列的数据类型正确,例如 Int32Float64 等。

    • 聚合函数状态列的数据类型会自动推导为 AggregateFunction 类型。
    • 物化视图的更新:

    • 物化视图会在源表有新数据插入时自动更新,不需要手动刷新。

    • 由于物化视图是异步更新的,可能会有微小的延迟。
    • 查询优化:

    • 在查询时使用 GROUP BY 和对应的 ...Merge 函数,可以确保获取准确的聚合结果。