Store the distribution of derivations related to packages

This might be generally useful, but I've been looking at it as it offers a way
to try and improve query performance when you want to select all the
derivations related to the packages for a revision.

The data looks like this (for a specified system and target):

┌───────┬───────┐
│ level │ count │
├───────┼───────┤
│    15 │     2 │
│    14 │     3 │
│    13 │     3 │
│    12 │     3 │
│    11 │    14 │
│    10 │    25 │
│     9 │    44 │
│     8 │    91 │
│     7 │  1084 │
│     6 │   311 │
│     5 │   432 │
│     4 │   515 │
│     3 │   548 │
│     2 │  2201 │
│     1 │ 21162 │
│     0 │ 22310 │
└───────┴───────┘

Level 0 reflects the number of packages. Level 1 is similar as you have all
the derivations for the package origins. The remaining levels contain less
packages since it's mostly just derivations involved in bootstrapping.

When using a recursive CTE to collect all the derivations, PostgreSQL assumes
that the each derivation has the same number of inputs, and this leads to a
large overestimation of the number of derivations per a revision. This in turn
can lead to PostgreSQL picking a slower way of running the query.

When it's known how many new derivations you should see at each level, it's
possible to inform PostgreSQL this by using LIMIT's at various points in the
query. This reassures the query planner that it's not going to be handling
lots of rows and helps it make better decisions about how to execute the
query.
This commit is contained in:
Christopher Baines 2023-03-09 08:29:39 +00:00
parent 55059558e7
commit e39c9da028
7 changed files with 299 additions and 2 deletions

View File

@ -1595,7 +1595,13 @@ WHERE job_id = $1")
package-derivation-ids)
(simple-format
#t "Successfully loaded ~A package/derivation pairs\n"
ids-count))))
ids-count))
(with-time-logging
"insert-guix-revision-package-derivation-distribution-counts"
(insert-guix-revision-package-derivation-distribution-counts
conn
guix-revision-id))))
#t)
(lambda (key . args)
(simple-format (current-error-port)

View File

@ -16,8 +16,16 @@
;;; <http://www.gnu.org/licenses/>.
(define-module (guix-data-service model guix-revision-package-derivation)
#:use-module (ice-9 match)
#:use-module (ice-9 threads)
#:use-module (squee)
#:export (insert-guix-revision-package-derivations))
#:use-module (guix-data-service database)
#:export (insert-guix-revision-package-derivations
insert-guix-revision-package-derivation-distribution-counts
backfill-guix-revision-package-derivation-distribution-counts
get-sql-to-select-package-and-related-derivations-for-revision))
(define (insert-guix-revision-package-derivations
conn guix-revision-id package-derivation-ids)
@ -35,3 +43,250 @@
";"))
(exec-query conn insert))
(define (insert-guix-revision-package-derivation-distribution-counts
conn
guix-revision-id)
(define system-ids-and-targets
(exec-query
conn
"
SELECT DISTINCT system_id, target
FROM package_derivations
INNER JOIN guix_revision_package_derivations
ON package_derivations.id = guix_revision_package_derivations.package_derivation_id
WHERE revision_id = $1"
(list guix-revision-id)))
(define (get-count-for-next-level system target level-counts)
(define next-level
(length level-counts))
(define query
(string-append
(simple-format
#f
"
WITH l0 AS (
SELECT derivation_id
FROM package_derivations
INNER JOIN guix_revision_package_derivations
ON package_derivations.id
= guix_revision_package_derivations.package_derivation_id
WHERE revision_id = 19411
AND system_id = 4
AND target = $STR$$STR$~A
)"
(if (= next-level 0)
""
(simple-format
#f
"
LIMIT ~A"
(car level-counts))))
(if (= next-level 0)
""
(string-join
(map
(match-lambda*
((level count)
(simple-format
#f
", l~A AS (
(
SELECT derivation_outputs.derivation_id
FROM derivation_outputs WHERE derivation_outputs.id IN (
SELECT DISTINCT derivation_inputs.derivation_output_id
FROM l~A
INNER JOIN derivation_inputs
ON l~A.derivation_id = derivation_inputs.derivation_id
)
) EXCEPT (~A
)~A
)"
level
(- level 1)
(- level 1)
(string-join
(map
(lambda (level)
(simple-format
#f
" SELECT derivation_id FROM l~A"
level))
(iota level))
"
UNION ALL")
(if count
(simple-format
#f
"
LIMIT ~A"
count)
""))))
(iota (length level-counts) 1)
(append (cdr level-counts) '(#f)))
""))
(simple-format
#f
"
SELECT COUNT(*) FROM l~A"
(length level-counts))))
(string->number
(caar
(exec-query
conn
query))))
(define (insert-level-count system-id target level count)
(exec-query
conn
"
INSERT INTO guix_revision_package_derivation_distribution_counts
VALUES ($1, $2, $3, $4, $5)"
(list guix-revision-id
system-id
target
(number->string level)
(number->string count))))
(for-each
(match-lambda
((system-id target)
(let loop ((level-counts '()))
(let ((level (length level-counts))
(count (get-count-for-next-level system-id target level-counts)))
(unless (= count 0)
(insert-level-count system-id target level count)
(loop (append level-counts (list count))))))))
system-ids-and-targets))
(define (backfill-guix-revision-package-derivation-distribution-counts)
(define revision-ids
(with-thread-postgresql-connection
(lambda (conn)
(map
car
(exec-query
conn
"
SELECT id
FROM guix_revisions
EXCEPT
SELECT guix_revision_id
FROM guix_revision_package_derivation_distribution_counts
ORDER BY id DESC")))))
(n-par-for-each
4
(lambda (revision-id)
(simple-format #t "backfilling guix_revision_package_derivation_distribution_counts for revision ~A\n" revision-id)
(with-thread-postgresql-connection
(lambda (conn)
(with-postgresql-transaction
conn
(lambda (conn)
(insert-guix-revision-package-derivation-distribution-counts
conn
revision-id))))))
revision-ids))
(define* (get-sql-to-select-package-and-related-derivations-for-revision
conn
guix-revision-id
#:key system-id target)
(define level-counts
(map
(match-lambda
((level count)
(list
(string->number level)
(string->number count))))
(exec-query
conn
"
SELECT level, distinct_derivations
FROM guix_revision_package_derivation_distribution_counts
WHERE guix_revision_id = $1
AND system_id = $2
AND target = $3
ORDER BY level ASC"
(list guix-revision-id
(number->string system-id)
target))))
(define (query level-counts)
(string-append
(simple-format
#f
"
WITH l0 AS (
SELECT derivation_id
FROM package_derivations
INNER JOIN guix_revision_package_derivations
ON package_derivations.id
= guix_revision_package_derivations.package_derivation_id
WHERE revision_id = ~A
AND system_id = ~A
AND target = $STR$~A$STR$
LIMIT ~A
)"
guix-revision-id
system-id
target
(cdr (car level-counts)))
(string-join
(map
(match-lambda*
((level count)
(simple-format
#f
", l~A AS (
(
SELECT derivation_outputs.derivation_id
FROM derivation_outputs WHERE derivation_outputs.id IN (
SELECT DISTINCT derivation_inputs.derivation_output_id
FROM l~A
INNER JOIN derivation_inputs
ON l~A.derivation_id = derivation_inputs.derivation_id
)
) EXCEPT (~A
)~A
)"
level
(- level 1)
(- level 1)
(string-join
(map
(lambda (level)
(simple-format
#f
" SELECT derivation_id FROM l~A"
level))
(iota level))
"
UNION ALL")
(simple-format
#f
"
LIMIT ~A"
count))))
(iota (- (length level-counts) 1) 1)
(cdr (map cdr level-counts)))
"")
", all_derivations AS (
SELECT *
FROM l0"
(string-join
(map (lambda (level)
(simple-format #f " UNION (SELECT * FROM l~A)" level))
(iota (- (length level-counts) 1) 1))
"\n")
"
)"))
(if level-counts
(query level-counts)
#f))

View File

@ -37,6 +37,7 @@
(guix-data-service config)
(guix-data-service database)
(guix-data-service substitutes)
(guix-data-service model guix-revision-package-derivation)
(guix-data-service web server)
(guix-data-service web controller)
(guix-data-service web nar controller))
@ -217,6 +218,13 @@
(pid-file (assq-ref opts 'pid-file)))
(call-with-new-thread
(lambda ()
(with-postgresql-connection-per-thread
"backfill"
(lambda ()
(backfill-guix-revision-package-derivation-distribution-counts)))))
(when pid-file
(call-with-output-file pid-file
(lambda (port)

View File

@ -0,0 +1,13 @@
-- Deploy guix-data-service:guix_revision_package_derivation_distribution_counts to pg
BEGIN;
CREATE TABLE guix_revision_package_derivation_distribution_counts (
guix_revision_id integer NOT NULL REFERENCES guix_revisions (id),
system_id integer NOT NULL REFERENCES systems (id),
target varchar NOT NULL,
level integer NOT NULL,
distinct_derivations integer NOT NULL
);
COMMIT;

View File

@ -0,0 +1,7 @@
-- Revert guix-data-service:guix_revision_package_derivation_distribution_counts from pg
BEGIN;
-- XXX Add DDLs here.
COMMIT;

View File

@ -93,3 +93,4 @@ blocked_builds 2022-11-07T11:27:28Z Chris <chris@felis> # Add blocked_builds
package_derivations_extended_statistics 2022-11-12T10:40:18Z Chris <chris@felis> # Add extended statistics on package_derivations
derivation_outputs_id_and_derivation_id_idx 2022-11-12T10:41:42Z Chris <chris@felis> # Add index on derivation_outputs id and derivation_id
blocked_builds_blocked_builds_blocked_derivation_output_details_set_id_2 2023-03-05T10:19:53Z Chris <chris@felis> # Add index on blocked_builds_blocked_derivation_output_details_set_id
guix_revision_package_derivation_distribution_counts 2023-03-08T16:53:44Z Chris <chris@felis> # Add guix_revision_package_derivation_distribution_counts table

View File

@ -0,0 +1,7 @@
-- Verify guix-data-service:guix_revision_package_derivation_distribution_counts on pg
BEGIN;
-- XXX Add verifications here.
ROLLBACK;