diff --git a/CHANGES/1099.feature b/CHANGES/1099.feature new file mode 100644 index 00000000..1699af0e --- /dev/null +++ b/CHANGES/1099.feature @@ -0,0 +1 @@ +Added support for recreating and fixing metadata files to `repair_metadata` endpoint. diff --git a/pulp_python/app/tasks/repair.py b/pulp_python/app/tasks/repair.py index c2cfc0d0..bcc519dc 100644 --- a/pulp_python/app/tasks/repair.py +++ b/pulp_python/app/tasks/repair.py @@ -8,11 +8,12 @@ from django.db.models.query import QuerySet from pulp_python.app.models import PythonPackageContent, PythonRepository from pulp_python.app.utils import ( + artifact_to_metadata_artifact, artifact_to_python_content_data, fetch_json_release_metadata, parse_metadata, ) -from pulpcore.plugin.models import ContentArtifact, ProgressReport +from pulpcore.plugin.models import Artifact, ContentArtifact, Domain, ProgressReport from pulpcore.plugin.util import get_domain log = logging.getLogger(__name__) @@ -41,16 +42,25 @@ def repair(repository_pk: UUID) -> None: content_set = repository.latest_version().content.values_list("pk", flat=True) content = PythonPackageContent.objects.filter(pk__in=content_set) - num_repaired, pkgs_not_repaired = repair_metadata(content) + num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired = ( + repair_metadata(content) + ) + # Convert set() to 0 + if not pkgs_not_repaired: + pkgs_not_repaired = 0 + if not pkgs_metadata_not_repaired: + pkgs_metadata_not_repaired = 0 + log.info( _( "{} packages' metadata repaired. Not repaired packages due to either " - "inaccessible URL or mismatched sha256: {}." - ).format(num_repaired, pkgs_not_repaired) + "inaccessible URL or mismatched sha256: {}. " + "{} metadata files repaired. Packages whose metadata files could not be repaired: {}." 
+ ).format(num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired) ) -def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]: +def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str], int, set[str]]: """ Repairs metadata for a queryset of PythonPackageContent objects and updates the progress report. @@ -59,9 +69,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s content (QuerySet[PythonPackageContent]): The queryset of items to repair. Returns: - tuple[int, set[str]]: A tuple containing: + tuple[int, set[str], int, set[str]]: A tuple containing: - The number of packages that were repaired. - A set of packages' PKs that were not repaired. + - The number of metadata files that were repaired. + - A set of packages' PKs without repaired metadata artifacts. """ immediate_content = ( content.filter(contentartifact__artifact__isnull=False) @@ -87,6 +99,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s # Keep track of on-demand packages that were not repaired pkgs_not_repaired = set() + # Metadata artifacts and content artifacts + metadata_batch = [] + total_metadata_repaired = 0 + pkgs_metadata_not_repaired = set() + progress_report = ProgressReport( message="Repairing packages' metadata", code="repair.metadata", @@ -102,6 +119,14 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s .artifact ) new_data = artifact_to_python_content_data(package.filename, main_artifact, domain) + total_metadata_repaired += update_metadata_artifact_if_needed( + package, + new_data.get("metadata_sha256"), + main_artifact, + domain, + metadata_batch, + pkgs_metadata_not_repaired, + ) total_repaired += update_package_if_needed( package, new_data, batch, set_of_update_fields ) @@ -163,7 +188,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s total_repaired += len(batch) 
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields) - return total_repaired, pkgs_not_repaired + if metadata_batch: + not_repaired = _process_metadata_batch(metadata_batch) + pkgs_metadata_not_repaired.update(not_repaired) + total_metadata_repaired += len(metadata_batch) - len(not_repaired) + + return total_repaired, pkgs_not_repaired, total_metadata_repaired, pkgs_metadata_not_repaired def update_package_if_needed( @@ -202,3 +232,95 @@ def update_package_if_needed( set_of_update_fields.clear() return total_repaired + + +def update_metadata_artifact_if_needed( + package: PythonPackageContent, + new_metadata_sha256: str | None, + main_artifact: Artifact, + domain: Domain, + metadata_batch: list[tuple], + pkgs_metadata_not_repaired: set[str], +) -> int: + """ + Repairs metadata artifacts for wheel packages by creating missing metadata artifacts + or updating existing ones when the metadata_sha256 differs. Only processes wheel files + that have a valid new_metadata_sha256. Queues operations for batch processing. + + Args: + package: Package to check for metadata changes. + new_metadata_sha256: The correct metadata_sha256 extracted from the main artifact, or None. + main_artifact: The main package artifact used to generate metadata. + domain: The domain in which the metadata artifact will be created. + metadata_batch: List of tuples for batch processing (updated in-place). + pkgs_metadata_not_repaired: Set of package PKs that failed repair (updated in-place). + + Returns: + Number of repaired metadata artifacts (only when batch is flushed at BULK_SIZE). 
+ """ + total_metadata_repaired = 0 + + if not package.filename.endswith(".whl") or not new_metadata_sha256: + return total_metadata_repaired + + original_metadata_sha256 = package.metadata_sha256 + cas = package.contentartifact_set.filter(relative_path__endswith=".metadata") + + # Create missing + if not cas: + metadata_batch.append(("create", package, main_artifact, None, domain)) + # Fix existing + elif new_metadata_sha256 != original_metadata_sha256: + ca = cas.first() + metadata_artifact = ca.artifact + if metadata_artifact is None or (metadata_artifact.sha256 != new_metadata_sha256): + metadata_batch.append(("update", package, main_artifact, ca, domain)) + + if len(metadata_batch) == BULK_SIZE: + not_repaired = _process_metadata_batch(metadata_batch) + pkgs_metadata_not_repaired.update(not_repaired) + total_metadata_repaired += BULK_SIZE - len(not_repaired) + metadata_batch.clear() + + return total_metadata_repaired + + +def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]: + """ + Processes a batch of metadata repair operations by creating metadata artifacts + and their corresponding ContentArtifacts. + + Args: + metadata_batch: List of (action, package, main_artifact, content_artifact, domain) tuples. + + Returns: + Set of package PKs for which metadata artifacts could not be created. 
+ """ + not_repaired = set() + content_artifacts_to_create = [] + content_artifacts_to_update = [] + + for action, package, main_artifact, content_artifact, domain in metadata_batch: + metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact) + if metadata_artifact: + metadata_artifact.pulp_domain = domain + metadata_artifact.save() + if action == "create": + ca = ContentArtifact( + artifact=metadata_artifact, + content=package, + relative_path=f"{package.filename}.metadata", + ) + content_artifacts_to_create.append(ca) + elif action == "update": + content_artifact.artifact = metadata_artifact + content_artifacts_to_update.append(content_artifact) + else: + not_repaired.add(package.pk) + + if content_artifacts_to_create: + ContentArtifact.objects.bulk_create(content_artifacts_to_create) + if content_artifacts_to_update: + ContentArtifact.objects.bulk_update(content_artifacts_to_update, ["artifact"]) + + return not_repaired diff --git a/pulp_python/tests/functional/api/test_repair.py b/pulp_python/tests/functional/api/test_repair.py index ede9df46..102f868e 100644 --- a/pulp_python/tests/functional/api/test_repair.py +++ b/pulp_python/tests/functional/api/test_repair.py @@ -10,7 +10,7 @@ @pytest.fixture def create_content_direct(python_bindings): - def _create(artifact_filename, content_data): + def _create(artifact_filename, content_data, metadata_artifact_filename=None): commands = ( "from pulpcore.plugin.models import Artifact, ContentArtifact; " "from pulpcore.plugin.util import get_url; " @@ -21,8 +21,16 @@ def _create(artifact_filename, content_data): "c.save(); " f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); " "ca.save(); " - "print(get_url(c))" ) + if metadata_artifact_filename: + commands += ( + f"a2 = Artifact.init_and_validate('{metadata_artifact_filename}'); " + "a2.save(); " + "ca2_filename = c.filename + '.metadata'; " + f"ca2 = ContentArtifact(artifact=a2, content=c, relative_path=ca2_filename); " + 
"ca2.save(); " + ) + commands += "print(get_url(c))" process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True) assert process.returncode == 0 @@ -214,3 +222,112 @@ def test_metadata_repair_endpoint( assert new_content.author == author assert new_content.packagetype == packagetype assert new_content.requires_python == requires_python + + +def test_metadata_artifact_repair_endpoint( + create_content_direct, + delete_orphans_pre, + download_python_file, + monitor_task, + move_to_repository, + pulpcore_bindings, + python_bindings, + python_repo_factory, +): + """ + Test repairing of PythonPackageContent's metadata_sha256 and its metadata Artifact + and ContentArtifact via `Repositories.repair_metadata` endpoint. + """ + # 1. Setup tested data + python_repo = python_repo_factory() + + # missing metadata_sha256, missing metadata Artifact + ContentArtifact + filename_1 = "scipy-1.1.0-cp27-none-win_amd64.whl" + metadata_1 = None + url_1 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_1) + file_1 = download_python_file(filename_1, url_1) + + # correct metadata_sha256, missing metadata Artifact + ContentArtifact + filename_2 = "scipy-1.1.0-cp27-cp27m-manylinux1_x86_64.whl" + metadata_2 = "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6" + url_2 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_2) + file_2 = download_python_file(filename_2, url_2) + + # wrong metadata_sha256, missing metadata Artifact + ContentArtifact + filename_3 = "scipy-1.1.0-cp34-none-win32.whl" + metadata_3 = "1234" + url_3 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_3) + file_3 = download_python_file(filename_3, url_3) + + # wrong metadata_sha256, wrong metadata Artifact, correct metadata ContentArtifact + filename_4 = "scipy-1.1.0-cp35-none-win32.whl" + metadata_4 = "5678" + url_4 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_4) + file_4 = download_python_file(filename_4, url_4) + 
metadata_file_4 = download_python_file( + f"{filename_1}.metadata", + urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), f"{filename_1}.metadata"), + ) + + # Build PythonPackageContent data + filenames = [filename_1, filename_2, filename_3, filename_4] + metadata_sha256s = [metadata_1, metadata_2, metadata_3, metadata_4] + data_1, data_2, data_3, data_4 = [ + {"name": "scipy", "version": "1.1.0", "filename": f, "metadata_sha256": m} + for f, m in zip(filenames, metadata_sha256s) + ] + + # 2. Create content + content_1 = create_content_direct(file_1, data_1) + content_2 = create_content_direct(file_2, data_2) + content_3 = create_content_direct(file_3, data_3) + content_4 = create_content_direct(file_4, data_4, metadata_file_4) + + content_hrefs = {} + for data, content in [ + (data_1, content_1), + (data_2, content_2), + (data_3, content_3), + (data_4, content_4), + ]: + for field, test_value in data.items(): + assert getattr(content, field) == test_value + content_hrefs[data["filename"]] = content.pulp_href + move_to_repository(python_repo.pulp_href, list(content_hrefs.values())) + + # 3. Repair metadata and metadata files + response = python_bindings.RepositoriesPythonApi.repair_metadata(python_repo.pulp_href) + monitor_task(response.task) + + # 4. 
Check new metadata and metadata files + main_artifact_hrefs = set() + metadata_artifact_hrefs = set() + new_data = [ + (filename_1, "15ae132303b2774a0d839d01c618cf99fc92716adfaaa2bc1267142ab2b76b98"), + (filename_2, "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6"), + # filename_3 and filename_4 have the same metadata file + (filename_3, "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364"), + (filename_4, "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364"), + ] + for filename, metadata_sha256 in new_data: + content = pulpcore_bindings.ContentApi.list(pulp_href__in=[content_hrefs[filename]]).results + assert content + artifacts = content[0].artifacts + assert len(artifacts) == 2 + + main_artifact_href = artifacts.get(filename) + main_artifact_hrefs.add(main_artifact_href) + main_artifact = pulpcore_bindings.ArtifactsApi.read(main_artifact_href) + + metadata_artifact_href = artifacts.get(f"{filename}.metadata") + metadata_artifact_hrefs.add(metadata_artifact_href) + metadata_artifact = pulpcore_bindings.ArtifactsApi.read(metadata_artifact_href) + + pkg = python_bindings.ContentPackagesApi.read(content_hrefs[filename]) + assert pkg.metadata_sha256 == metadata_sha256 + assert main_artifact.sha256 == pkg.sha256 + assert metadata_artifact.sha256 == pkg.metadata_sha256 + + # Check deduplication + assert len(main_artifact_hrefs) == 4 + assert len(metadata_artifact_hrefs) == 3 diff --git a/pyproject.toml b/pyproject.toml index 992aac8f..cdbc7bec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,9 @@ underlines = ["", "", ""] [tool.check-manifest] ignore = [ + "AGENTS.md", "CHANGES/**", + "CLAUDE.md", "dev_requirements.txt", "doc_requirements.txt", "docs/**",