citus 系列7 - topn 加速(count(*) group by order by cou
背景
postgres=# \df topn* List of functions Schema | Name | Result data type | Argument data * | Type --------+------------------+-------------------+---------------------+-------- public | topn | SETOF topn_record | jsonb, integer | normal public | topn_add | jsonb | jsonb, text | normal public | topn_add_agg | jsonb | text | agg public | topn_add_trans | internal | internal, text | normal public | topn_pack | jsonb | internal | normal public | topn_union | jsonb | jsonb, jsonb | normal public | topn_union_agg | jsonb | jsonb | agg public | topn_union_trans | internal | internal, jsonb | normal (8 rows)
-- starting from nothing, record that we saw an "a" select topn_add('{}', 'a'); -- => {"a": 1} -- record the sighting of another "a" select topn_add(topn_add('{}', 'a'), 'a'); -- => {"a": 2} -- for normal_rand create extension tablefunc; -- count values from a normal distribution SELECT topn_add_agg(floor(abs(i))::text) FROM normal_rand(1000, 5, 0.7) i; -- => {"2": 1, "3": 74, "4": 420, "5": 425, "6": 77, "7": 3}
postgres=# select (topn(topn_union_agg(agg_prodid),5)).* from reviews_by_prodid; item | frequency --------+----------- 509594 | 66 497599 | 59 505217 | 58 461257 | 58 403111 | 57 (5 rows)
使用topn
cd ~ . /var/lib/pgsql/.bash_profile git clone https://github.com/citusdata/postgresql-topn cd postgresql-topn USE_PGXS=1 make USE_PGXS=1 make install
postgres=# create extension topn; CREATE EXTENSION
postgres=# select run_command_on_workers('create extension topn;'); run_command_on_workers -------------------------------------------- (xxx.xxx.xxx.224,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.225,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.226,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.227,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.229,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.230,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.231,1921,t,"CREATE EXTENSION") (xxx.xxx.xxx.232,1921,t,"CREATE EXTENSION") (8 rows)
测试
create table tbl(id serial8,gid int, prodid int, c1 int, c2 int); postgres=# \d tbl Table "public.tbl" Column | Type | Collation | Nullable | Default --------+---------+-----------+----------+--------------------------------- id | bigint | | not null | nextval('tbl_id_seq'::regclass) gid | integer | | | prodid | integer | | | c1 | integer | | | c2 | integer | | | postgres=# alter sequence tbl_id_seq cache 10000; ALTER SEQUENCE
vi test.sql \set gid random_gaussian(1,1000,2.5) \set prodid random_gaussian(1,1000000,2.5) \set c1 random(1,3000) \set c2 random(1,100000000) insert into tbl(gid,prodid,c1,c2) values (:gid,:prodid,:c1,:c2); pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 1200
postgres=# select count(*) from tbl; count ----------- 216524755 (1 row) Time: 421.860 ms
postgres=# select gid,count(*) from tbl group by gid order by count(*) desc limit 10; gid | count -----+-------- 494 | 438102 499 | 438017 514 | 437929 506 | 437852 511 | 437546 509 | 437469 495 | 437458 490 | 437320 496 | 437257 500 | 437239 (10 rows) postgres=# select c1,count(*) from tbl group by c1 order by count(*) desc limit 10; c1 | count ------+------- 1370 | 73175 168 | 73121 1016 | 73114 1816 | 73045 1463 | 73020 585 | 72986 1529 | 72974 1857 | 72944 2580 | 72930 298 | 72917 (10 rows) postgres=# select prodid,count(*) from tbl group by prodid order by count(*) desc limit 10; prodid | count --------+------- 516916 | 534 481914 | 534 520680 | 527 530544 | 526 449685 | 523 493560 | 523 520464 | 523 502098 | 522 495170 | 522 501695 | 522 (10 rows)
CREATE TABLE reviews_by_gid ( agg jsonb ); SELECT create_reference_table('reviews_by_gid'); INSERT INTO reviews_by_gid SELECT topn_add_agg(gid::text) FROM tbl; postgres=# select (topn(agg,5)).* from reviews_by_gid; item | frequency ------+----------- 494 | 438102 499 | 438017 514 | 437929 506 | 437852 511 | 437546 (5 rows)
CREATE TABLE reviews_by_prodid ( agg_prodid jsonb ); SELECT create_reference_table('reviews_by_prodid'); INSERT INTO reviews_by_prodid SELECT topn_add_agg(prodid::text) FROM tbl; postgres=# select (topn(agg_prodid,5)).* from reviews_by_prodid; item | frequency --------+----------- 470098 | 36 531880 | 35 451724 | 34 420093 | 34 522676 | 33 (5 rows)
CREATE TABLE reviews_by_c1 ( aggc1 jsonb ); SELECT create_reference_table('reviews_by_c1'); INSERT INTO reviews_by_c1 SELECT topn_add_agg(c1::text) FROM tbl; postgres=# select (topn(aggc1,5)).* from reviews_by_c1; item | frequency ------+----------- 2580 | 37073 1016 | 36162 1983 | 35311 1752 | 35285 2354 | 34740 (5 rows)
精度、截断
/* * PruneHashTable removes some items from the HashTable to decrease its size. It finds * minimum and maximum frequencies first and removes the items which have lower frequency * than the average of them. */ static void PruneHashTable(HTAB *hashTable, int itemLimit, int numberOfRemainingElements) { Size topnArraySize = 0; int topnIndex = 0; FrequentTopnItem *sortedTopnArray = NULL; bool itemAlreadyHashed = false; HASH_SEQ_STATUS status; FrequentTopnItem *currentTask = NULL; FrequentTopnItem *frequentTopnItem = NULL; int index = 0; int hashTableSize = hash_get_num_entries(hashTable); if (hashTableSize <= itemLimit) { return; } /* create an array to copy top-n items and sort them later */ topnArraySize = sizeof(FrequentTopnItem) * hashTableSize; sortedTopnArray = (FrequentTopnItem *) palloc0(topnArraySize); hash_seq_init(&status, hashTable); while ((currentTask = (FrequentTopnItem *) hash_seq_search(&status)) != NULL) { frequentTopnItem = palloc0(sizeof(FrequentTopnItem)); memcpy(frequentTopnItem->key, currentTask->key, sizeof(frequentTopnItem->key)); frequentTopnItem->frequency = currentTask->frequency; sortedTopnArray[topnIndex] = *frequentTopnItem; topnIndex++; } qsort(sortedTopnArray, hashTableSize, sizeof(FrequentTopnItem), compareFrequentTopnItem); for (index = numberOfRemainingElements; index < hashTableSize; index++) { FrequentTopnItem *topnItem = &(sortedTopnArray[index]); hash_search(hashTable, (void *) topnItem->key, HASH_REMOVE, &itemAlreadyHashed); } }
如何修改hash table size
postgres=# load 'topn'; LOAD postgres=# show topn.number_of_counters ; topn.number_of_counters ------------------------- 1000 (1 row) set topn.number_of_counters =20000;
postgresql.conf shared_preload_libraries='citus,topn,pg_stat_statements' topn.number_of_counters=10000
小结
佳实践
参考
相关文章