Skip to content

Commit

Permalink
Merge pull request #133 from ecmwf-projects/clean-old-licences
Browse files Browse the repository at this point in the history
bugfix: old revisions of licences now are removed
  • Loading branch information
alex75 authored Sep 25, 2024
2 parents 18248f5 + e5de8c0 commit 69c4812
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 19 deletions.
2 changes: 1 addition & 1 deletion cads_catalogue/entry_points.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,7 @@ def update_catalogue(
resources_folder_path,
cim_folder_path,
storage_settings,
force=force,
force=force or licences_processed,
include=include,
exclude=exclude,
override_md=current_override_md,
Expand Down
42 changes: 24 additions & 18 deletions cads_catalogue/licence_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ def update_catalogue_licences(
session: sa.orm.session.Session,
licences_folder_path: str,
storage_settings: config.ObjectStorageSettings,
) -> List[str]:
) -> List[tuple]:
"""
Load metadata of licences from files and sync each licence in the db.
Expand All @@ -253,27 +253,32 @@ def update_catalogue_licences(
Returns
-------
list: list of licence uids involved
list: list of tuple (licence uid, revision) of licences involved
"""
involved_licence_uids = []
involved_licences = []
licences = load_licences_from_folder(licences_folder_path)
logger.info("loaded %s licences from %s" % (len(licences), licences_folder_path))
for licence in licences:
licence_uid = licence["licence_uid"]
involved_licence_uids.append(licence_uid)
revision = int(licence["revision"])
involved_licences.append((licence_uid, revision))
try:
with session.begin_nested():
licence_sync(session, licence_uid, licences, storage_settings)
logger.info("licence '%s' db sync successful" % licence_uid)
logger.info(
"licence '%s' (revision %s): db sync successful"
% (licence_uid, revision)
)
except Exception: # noqa
logger.exception(
"db sync for licence '%s' failed, error follows" % licence_uid
"db sync for licence '%s' (revision %s) failed, error follows"
% (licence_uid, revision)
)
return involved_licence_uids
return involved_licences


def remove_orphan_licences(
session: sa.orm.session.Session, keep_licences: List[str], resources: List[str]
session: sa.orm.session.Session, keep_licences: List[tuple], resources: List[str]
):
"""
Remove all licences that are not in the list of `keep_licences` and are unrelated to any of the given `resources`.
Expand All @@ -284,18 +289,19 @@ def remove_orphan_licences(
keep_licences: list of tuple (licence uid, revision) of licences to keep
resources: list of resource_uid
"""
licences_to_delete = session.scalars(
sa.select(database.Licence).filter(
database.Licence.licence_uid.notin_(keep_licences)
)
)
for licence_to_delete in licences_to_delete:
related_dataset_uids = [r.resource_uid for r in licence_to_delete.resources] # type: ignore
all_licences = session.scalars(sa.select(database.Licence))
for licence in all_licences:
if (licence.licence_uid, licence.revision) in keep_licences:
continue
related_dataset_uids = [r.resource_uid for r in licence.resources]
if set(related_dataset_uids).intersection(set(resources)):
continue
licence_to_delete.resources = [] # type: ignore
session.delete(licence_to_delete)
logger.info("removed licence '%s'" % licence_to_delete.licence_uid)
licence.resources = []
session.delete(licence)
logger.info(
"removed licence '%s' (revision %s)"
% (licence.licence_uid, licence.revision)
)


def migrate_from_cds_licences(
Expand Down
161 changes: 161 additions & 0 deletions tests/test_20_licence_manager.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import operator
import os.path
import uuid
from typing import Any

import pytest_mock
import sqlalchemy as sa
Expand All @@ -10,6 +12,41 @@
TESTDATA_PATH = os.path.join(THIS_PATH, "data")


def mock_dataset(
    resource_uid: str | None = None,
    abstract: str = "abstract",
    description: dict[str, Any] | None = None,
    **kwargs,
) -> database.Resource:
    """Build an unsaved ``database.Resource`` of type "dataset" for tests.

    Parameters
    ----------
    resource_uid: uid to assign; a random uuid4 string is used when falsy
    abstract: value for the ``abstract`` attribute
    description: value for the ``description`` attribute; an empty dict when falsy
    kwargs: any further keyword arguments, forwarded to ``database.Resource``

    Returns
    -------
    The new ``database.Resource`` instance (it is not added to any session here).
    """
    uid = resource_uid or str(uuid.uuid4())
    details = description or {}
    return database.Resource(
        resource_uid=uid,
        abstract=abstract,
        description=details,
        type="dataset",
        **kwargs,
    )


def mock_licence(
    licence_uid: str | None = None,
    revision: int = 1,
    title: str = "title",
    download_filename: str = "licence.pdf",
    md_filename: str = "licence.md",
    **kwargs,
) -> database.Licence:
    """Build an unsaved ``database.Licence`` for tests.

    Parameters
    ----------
    licence_uid: uid to assign; a random uuid4 string is used when falsy
    revision: value for the ``revision`` attribute
    title: value for the ``title`` attribute
    download_filename: value for the ``download_filename`` attribute
    md_filename: value for the ``md_filename`` attribute
    kwargs: any further keyword arguments, forwarded to ``database.Licence``

    Returns
    -------
    The new ``database.Licence`` instance (it is not added to any session here).
    """
    uid = licence_uid or str(uuid.uuid4())
    return database.Licence(
        licence_uid=uid,
        revision=revision,
        title=title,
        download_filename=download_filename,
        md_filename=md_filename,
        **kwargs,
    )


def test_licence_sync(
session_obj: sa.orm.sessionmaker, mocker: pytest_mock.MockerFixture
) -> None:
Expand Down Expand Up @@ -156,3 +193,127 @@ def test_load_licences_from_folder() -> None:
)

assert licences == expected_licences


def test_update_catalogue_licences(
    session_obj: sa.orm.sessionmaker, mocker: pytest_mock.MockerFixture
) -> None:
    """Sync the test licences folder and check the returned and stored licences.

    The call must return one (licence uid, revision) tuple per licence of the
    test folder, and each of those pairs must match exactly one db record.
    """
    source_folder = os.path.join(TESTDATA_PATH, "cads-licences")
    storage_settings = config.ObjectStorageSettings(
        object_storage_url="object/storage/url",
        storage_admin="admin1",
        storage_password="secret1",
        catalogue_bucket="mycatalogue_bucket",
        document_storage_url="my/url",
    )
    # replace the object storage upload with a stub returning a fixed url
    mocker.patch(
        "cads_catalogue.object_storage.store_file",
        return_value="an url",
    )
    expected_licences = [
        ("CCI-data-policy-for-satellite-surface-radiation-budget", 4),
        ("data-protection-privacy-statement", 24),
        ("eumetsat-cm-saf", 1),
        ("licence-to-use-copernicus-products", 12),
    ]
    with session_obj() as session:
        # load the licences from the folder and sync them into the db
        involved = licence_manager.update_catalogue_licences(
            session, source_folder, storage_settings
        )
        assert sorted(involved) == expected_licences
        session.commit()
        # every returned (uid, revision) pair must be stored exactly once
        for uid, rev in involved:
            query = sa.select(database.Licence).filter_by(
                licence_uid=uid, revision=rev
            )
            matches = session.scalars(query).all()
            assert len(matches) == 1


def test_remove_orphan_licences(session_obj: sa.orm.sessionmaker) -> None:
    """Exercise ``remove_orphan_licences`` over four keep/resource combinations.

    Three datasets and three licences (all at revision 1) are stored and linked,
    then the function is called with different `keep_licences` and resource
    lists, checking after each call which licences are still in the db.
    """
    resource_uids = ["datasetA", "datasetB", "datasetC"]
    licence_uids = [("licence_1", 1), ("licence_2", 1), ("licence_3", 1)]
    datasets = {}
    licences = {}
    with session_obj() as session:

        def count_stored(uid: str, rev: int) -> int:
            # number of db records matching the given licence uid and revision
            query = sa.select(database.Licence).filter_by(
                licence_uid=uid, revision=rev
            )
            return len(session.scalars(query).all())

        # store the datasets
        for dataset_uid in resource_uids:
            dataset = mock_dataset(resource_uid=dataset_uid)
            session.add(dataset)
            datasets[dataset_uid] = dataset
        session.commit()
        # store the licences
        for uid, rev in licence_uids:
            licence = mock_licence(licence_uid=uid, revision=rev)
            session.add(licence)
            licences[uid] = licence
        # link licences to datasets
        licences["licence_1"].resources = [datasets["datasetA"]]
        licences["licence_2"].resources = [
            datasets["datasetA"],
            datasets["datasetB"],
        ]
        licences["licence_3"].resources = [datasets["datasetC"]]
        session.commit()

        # case 1: every licence is listed as to keep, so nothing is removed
        keep_licences = licence_uids
        licence_manager.remove_orphan_licences(session, keep_licences, resource_uids)
        session.commit()
        for uid, rev in licence_uids:
            assert count_stored(uid, rev) == 1

        # case 2: nothing is listed as to keep, but every licence has a dataset,
        # so nothing is removed
        keep_licences = []
        licence_manager.remove_orphan_licences(session, keep_licences, resource_uids)
        session.commit()
        for uid, rev in licence_uids:
            assert count_stored(uid, rev) == 1

        # case 3: licence_3 is neither to keep nor related to a dataset: removed
        keep_licences = [
            ("licence_1", 1),
            ("licence_2", 1),
        ]
        licences["licence_3"].resources = []
        licence_manager.remove_orphan_licences(session, keep_licences, resource_uids)
        session.commit()
        for uid, rev in keep_licences:
            assert count_stored(uid, rev) == 1
        assert count_stored("licence_3", 1) == 0

        # case 4: licence_1 is not to keep and only related to a dataset that is
        # not in the resources list: removed, while licence_2 (on datasetB) stays
        keep_licences = []
        resource_uids = ["datasetB"]
        licence_manager.remove_orphan_licences(session, keep_licences, resource_uids)
        session.commit()
        assert count_stored("licence_2", 1) == 1
        assert count_stored("licence_1", 1) == 0

0 comments on commit 69c4812

Please sign in to comment.