2
0
Fork 0
mirror of git://git.savannah.gnu.org/guix/data-service.git synced 2023-12-14 03:23:03 +01:00

Fix some duplicated values in tables

The licenses table, along with the package_metadata table had duplicate
values. This could happen as the unique constraints on those tables didn't
properly account for the nullable fields.

The duplicates in those tables also affected the license_sets, packages,
package_derivations tables in a similar way. Finally, the
guix_revision_package_derivations table was also affected.

This commit adds a migration to fix the data, as well as the constraints. THe
code to populate the licenses and package_metadata tables is also updated.
This commit is contained in:
Christopher Baines 2019-08-04 09:39:40 +01:00
parent 1724bc485f
commit 84197686ce
10 changed files with 313 additions and 38 deletions

View file

@ -43,28 +43,6 @@
(inferior-eval '(use-modules (guix licenses)) inf)
(inferior-eval (proc packages) inf))
(define (select-licenses license-values)
(string-append
"SELECT id, licenses.name, licenses.uri, licenses.comment "
"FROM licenses "
"JOIN (VALUES "
(string-join
(map (lambda (values)
(string-append
"("
(string-join
(map value->quoted-string-or-null
values)
", ")
")"))
license-values)
", ")
") AS vals (name, uri, comment) "
"ON "
"licenses.name = vals.name AND "
"licenses.uri = vals.uri AND "
"licenses.comment = vals.comment"))
(define (insert-licenses values)
(string-append
"INSERT INTO licenses "
@ -91,15 +69,39 @@
(map number->string
(sort (map string->number ids) <)))
(define (non-string-to-false lst)
(map (lambda (value)
(if (string? value)
value
#f))
lst))
(define (empty-string-to-false lst)
;; TODO squee returns empty strings for null values, which will probably
;; cause problems
(map (lambda (value)
(if (string? value)
(if (string-null? value)
#f
value)
value))
lst))
(let* ((unique-license-tuples
(filter (lambda (license-tuple)
(not (null? license-tuple)))
(delete-duplicates
(concatenate license-data))))
(map
(lambda (lst)
(non-string-to-false
(empty-string-to-false lst)))
(concatenate license-data)))))
(existing-license-entries
(exec-query->vhash conn
(select-licenses unique-license-tuples)
cdr
"SELECT id, name, uri, comment FROM licenses"
(lambda (vals)
(non-string-to-false
(empty-string-to-false (cdr vals))))
first)) ;; id
(missing-license-entries
(delete-duplicates
@ -128,5 +130,8 @@
(begin
(error "missing license entry"
license-values)))))
license-value-lists)))
(map (lambda (lst)
(non-string-to-false
(empty-string-to-false lst)))
license-value-lists))))
license-data)))

View file

@ -52,8 +52,14 @@
"ON "
(string-join
(map (lambda (field)
(string-append
"package_metadata." field " = vals." field))
(if (member field '("home_page" "location_id"
"license_set_id"))
(string-append
"(package_metadata." field " = vals." field " OR "
"(package_metadata." field " IS NULL AND"
" vals." field " IS NULL))")
(string-append
"package_metadata." field " = vals." field)))
fields)
" AND ")))
@ -129,7 +135,8 @@ WHERE packages.id IN (
(map (lambda (package license-set-id)
(list (inferior-package-synopsis package)
(inferior-package-description package)
(inferior-package-home-page package)
(non-empty-string-or-false
(inferior-package-home-page package))
(location->location-id
conn
(inferior-package-location package))
@ -140,7 +147,14 @@ WHERE packages.id IN (
(let* ((existing-package-metadata-entries
(exec-query->vhash conn
(select-package-metadata package-metadata)
cdr
(match-lambda
((id synopsis description home-page
location-id license-set-id)
(list synopsis
description
(non-empty-string-or-false home-page)
location-id
license-set-id)))
first)) ;; id))
(missing-package-metadata-entries
(delete-duplicates

View file

@ -5,6 +5,7 @@
#:use-module (squee)
#:export (quote-string
value->quoted-string-or-null
non-empty-string-or-false
exec-query->vhash
two-lists->vhash
deduplicate-strings
@ -18,6 +19,13 @@
(string-append "$STR$" value "$STR$")
"NULL"))
(define (non-empty-string-or-false s)
(if (string? s)
(if (string-null? s)
#f
s)
#f))
(define (exec-query->vhash conn query field-function value-function)
(fold (lambda (row result)
(vhash-cons (field-function row)

View file

@ -0,0 +1,218 @@
-- Deploy guix-data-service:fix_duplicated_licenses to pg
BEGIN;
SET CONSTRAINTS ALL DEFERRED;
-- Remove unique constraint from license_sets
ALTER TABLE license_sets DROP CONSTRAINT license_sets_pkey;
-- Change all license sets to refer to canonical licenses
UPDATE license_sets AS master SET license_ids = ARRAY(
SELECT new_licenses_2.id FROM (
SELECT a.elem AS id, a.nr AS index
FROM license_sets, unnest(license_sets.license_ids)
WITH ORDINALITY a(elem, nr)
WHERE id = master.id
) AS old_licenses
INNER JOIN (
SELECT licenses.id AS old_id, new_licenses.*
FROM licenses INNER JOIN (
SELECT MIN(id) AS id, name, uri, comment
FROM licenses
GROUP BY name, uri, comment
ORDER BY name
) AS new_licenses
ON licenses.name = new_licenses.name AND
(licenses.uri = new_licenses.uri OR (
licenses.uri IS NULL AND new_licenses.uri IS NULL
)) AND (
licenses.comment = new_licenses.comment OR (
licenses.comment IS NULL AND new_licenses.comment IS NULL
))
) AS new_licenses_2
ON old_licenses.id = new_licenses_2.old_id
ORDER BY old_licenses.index);
-- Remove unique constraint from package_metadata
ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_unique_fields;
-- Update package_metadata to refer to canonical license_sets
WITH data AS (
SELECT MIN(id) AS id, ARRAY_AGG(id) AS old_ids
FROM license_sets
GROUP BY license_ids
)
UPDATE package_metadata AS master
SET license_set_id = data.id
FROM data
WHERE license_set_id = ANY(data.old_ids);
-- Remove unique constraint from packages
ALTER TABLE packages DROP CONSTRAINT packages_pkey;
-- Update packages to refer to canonical package_metadata entries
WITH data AS (
SELECT MIN(package_metadata.id) AS id, ARRAY_AGG(package_metadata.id) AS old_ids
FROM package_metadata
GROUP BY package_metadata.synopsis, package_metadata.description,
package_metadata.home_page, package_metadata.location_id,
package_metadata.license_set_id
HAVING COUNT(package_metadata.id) > 1
)
UPDATE packages SET package_metadata_id = data.id
FROM data
WHERE package_metadata_id = ANY(data.old_ids);
-- Remove unique constraint from package_derivations
ALTER TABLE package_derivations DROP CONSTRAINT package_derivations_pkey;
-- Update package_derivations to refer to canonical packages entries
WITH data AS (
SELECT unnest(old_ids) AS old, id FROM (
SELECT MIN(packages.id) AS id, ARRAY_AGG(packages.id) AS old_ids
FROM packages
GROUP BY name, version, package_metadata_id
HAVING COUNT(id) > 1
) AS d2
)
UPDATE package_derivations SET package_id = data.id
FROM data
WHERE package_id = data.old;
-- Update guix_revision_package_derivations to refer to canonical
-- package_derivations entries
WITH data AS (
SELECT unnest(old_ids) AS old, id FROM (
SELECT MIN(package_derivations.id) AS id, ARRAY_AGG(package_derivations.id) AS old_ids
FROM package_derivations
GROUP BY package_id, derivation_id, system, target
HAVING COUNT(id) > 1
) AS d2
)
UPDATE guix_revision_package_derivations SET package_derivation_id = data.id
FROM data
WHERE package_derivation_id = data.old;
-- Drop the foreign key constraint as an attempt to speed up deleting from
-- package_derivations.
ALTER TABLE guix_revision_package_derivations
DROP CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey;
-- Delete non-canonical package_dervations entries
DELETE FROM package_derivations AS pd WHERE id NOT IN (
SELECT MIN(id)
FROM package_derivations
GROUP BY (
package_id,
derivation_id,
system,
target
)
);
-- Reinstate the deleted constraint
ALTER TABLE guix_revision_package_derivations
ADD CONSTRAINT guix_revision_package_derivations_package_derivation_id_fkey
FOREIGN KEY (package_derivation_id) REFERENCES package_derivations(id);
-- Delete non-canonical packages entries
DELETE FROM packages AS p WHERE id NOT IN (
SELECT MIN(id)
FROM packages
GROUP BY (name, version, package_metadata_id)
);
-- Add referential constraints
ALTER TABLE package_derivations
ADD CONSTRAINT package_derivations_package_id_fkey
FOREIGN KEY (package_id) REFERENCES packages (id);
ALTER TABLE package_derivations
ADD CONSTRAINT package_derivations_derivation_id_fkey
FOREIGN KEY (derivation_id) REFERENCES derivations (id);
-- Delete non-canonical package_metadata entries
ALTER TABLE packages DROP CONSTRAINT package_metadata_id;
DELETE FROM package_metadata AS pm WHERE id NOT IN (
SELECT MIN(id)
FROM package_metadata
GROUP BY (synopsis, description, home_page, location_id, license_set_id)
);
ALTER TABLE packages ADD CONSTRAINT package_metadata_id
FOREIGN KEY (package_metadata_id) REFERENCES package_metadata(id);
-- Delete non-canonical license_sets entries
ALTER TABLE package_metadata DROP CONSTRAINT package_metadata_license_set_id_fkey;
DELETE FROM license_sets AS ls WHERE id NOT IN (
SELECT MIN(id)
FROM license_sets
GROUP BY license_ids
);
ALTER TABLE package_metadata ADD CONSTRAINT package_metadata_license_set_id_fkey
FOREIGN KEY (license_set_id) REFERENCES license_sets(id);
-- Delete non-canonical licenses entries
DELETE FROM licenses AS l WHERE id NOT IN (
SELECT MIN(id)
FROM licenses
GROUP BY (name, uri, comment)
);
-- Restore unique constraints
CREATE UNIQUE INDEX ON licenses (name)
WHERE uri IS NULL AND comment IS NULL;
CREATE UNIQUE INDEX ON licenses (name, uri)
WHERE uri IS NOT NULL AND comment IS NULL;
CREATE UNIQUE INDEX ON licenses (name, comment)
WHERE uri IS NULL AND comment IS NOT NULL;
CREATE UNIQUE INDEX ON licenses (name, uri, comment)
WHERE uri IS NOT NULL AND comment IS NOT NULL;
ALTER TABLE license_sets ADD PRIMARY KEY (license_ids);
ALTER TABLE package_metadata ALTER synopsis SET NOT NULL;
ALTER TABLE package_metadata ALTER description SET NOT NULL;
CREATE UNIQUE INDEX ON package_metadata (
synopsis,
description,
coalesce(location_id, -1),
coalesce(license_set_id, -1)
) WHERE home_page IS NULL;
CREATE UNIQUE INDEX ON package_metadata (
synopsis,
description,
home_page,
coalesce(location_id, -1),
coalesce(license_set_id, -1)
) WHERE home_page IS NOT NULL;
ALTER TABLE packages ADD PRIMARY KEY (name, version, package_metadata_id);
ALTER TABLE package_derivations ADD PRIMARY KEY (package_id, derivation_id, system, target);
COMMIT;

View file

@ -0,0 +1,7 @@
-- Revert guix-data-service:fix_duplicated_licenses from pg
BEGIN;
-- XXX Add DDLs here.
COMMIT;

View file

@ -15,3 +15,4 @@ dates_to_load_new_guix_revision_jobs 2019-06-02T07:39:49Z Christopher Baines <ma
load_new_guix_revision_job_events 2019-06-02T15:44:41Z Christopher Baines <mail@cbaines.net> # Add new table for guix_revision_job_events
load_new_guix_revision_job_logs 2019-06-21T14:33:09Z chris <chris@phact> # Add load_new_guix_revision_job_logs
change_load_new_guix_revision_job_logs_contents_to_be_nullable 2019-07-07T20:10:54Z Christopher Baines <mail@cbaines.net> # Change the contents field in the load_new_guix_revision_job_logs table\nto be nullable.\n\nwith '#' will # be ignored, and an empty message aborts the add. #\nChange to add: # #\nchange_load_new_guix_revision_job_logs_contents_to_be_nullable #\nsqitch/deploy/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/revert/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql\nsqitch/verify/change_load_new_guix_revision_job_logs_contents_to_be_nullable.sql
fix_duplicated_licenses 2019-07-30T05:48:17Z Christopher Baines <mail@cbaines.net> # Fix duplicated licenses, and add constraints

View file

@ -0,0 +1,7 @@
-- Verify guix-data-service:fix_duplicated_licenses on pg
BEGIN;
-- XXX Add verifications here.
ROLLBACK;

View file

@ -19,10 +19,10 @@
"https://example.com/why-license-1"))
(("License 1"
"https://gnu.org/licenses/test-1.html"
"https://example.com/why-license-1")
#f)
("License 2"
"https://gnu.org/licenses/test-2.html"
"https://example.com/why-license-2")))))
#f
#f)))))
(with-postgresql-connection
"test-model-license-set"

View file

@ -17,10 +17,13 @@
"https://example.com/why-license-1"))
(("License 1"
"https://gnu.org/licenses/test-1.html"
"https://example.com/why-license-1")
#f)
("License 2"
"https://gnu.org/licenses/test-2.html"
"https://example.com/why-license-2")))))
#f)
("License 3"
#f
#f)))))
(with-postgresql-connection
"test-model-license"

View file

@ -18,6 +18,15 @@
(home-page "https://example.com")
(location (location "file.scm" 5 0))))
(define mock-inferior-package-foo-2
(mock-inferior-package
(name "foo")
(version "2")
(synopsis "Foo")
(description "Foo description")
(home-page #f)
(location #f)))
(define (test-license-set-ids conn)
(mock
((guix-data-service model license)
@ -46,7 +55,8 @@
(match
(inferior-packages->package-metadata-ids
conn
(list mock-inferior-package-foo)
(list mock-inferior-package-foo
mock-inferior-package-foo-2)
(test-license-set-ids conn))
((x) (string? x))))
#:always-rollback? #t))
@ -57,11 +67,13 @@
(test-equal "inferior-packages->package-metadata-ids"
(inferior-packages->package-metadata-ids
conn
(list mock-inferior-package-foo)
(list mock-inferior-package-foo
mock-inferior-package-foo-2)
(test-license-set-ids conn))
(inferior-packages->package-metadata-ids
conn
(list mock-inferior-package-foo)
(list mock-inferior-package-foo
mock-inferior-package-foo-2)
(test-license-set-ids conn)))
#:always-rollback? #t))))))