Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES/1099.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added support for recreating and fixing metadata files to `repair_metadata` endpoint.
136 changes: 129 additions & 7 deletions pulp_python/app/tasks/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
from django.db.models.query import QuerySet
from pulp_python.app.models import PythonPackageContent, PythonRepository
from pulp_python.app.utils import (
artifact_to_metadata_artifact,
artifact_to_python_content_data,
fetch_json_release_metadata,
parse_metadata,
)
from pulpcore.plugin.models import ContentArtifact, ProgressReport
from pulpcore.plugin.models import Artifact, ContentArtifact, Domain, ProgressReport
from pulpcore.plugin.util import get_domain

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -41,16 +42,25 @@ def repair(repository_pk: UUID) -> None:
content_set = repository.latest_version().content.values_list("pk", flat=True)
content = PythonPackageContent.objects.filter(pk__in=content_set)

num_repaired, pkgs_not_repaired = repair_metadata(content)
num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired = (
repair_metadata(content)
)
# Convert set() to 0
if not pkgs_not_repaired:
pkgs_not_repaired = 0
if not pkgs_metadata_not_repaired:
pkgs_metadata_not_repaired = 0

log.info(
_(
"{} packages' metadata repaired. Not repaired packages due to either "
"inaccessible URL or mismatched sha256: {}."
).format(num_repaired, pkgs_not_repaired)
"inaccessible URL or mismatched sha256: {}. "
"{} metadata files repaired. Packages whose metadata files could not be repaired: {}."
).format(num_repaired, pkgs_not_repaired, num_metadata_repaired, pkgs_metadata_not_repaired)
)


def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str]]:
def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[str], int, set[str]]:
"""
Repairs metadata for a queryset of PythonPackageContent objects
and updates the progress report.
Expand All @@ -59,9 +69,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
content (QuerySet[PythonPackageContent]): The queryset of items to repair.

Returns:
tuple[int, set[str]]: A tuple containing:
tuple[int, set[str], int, set[str]]: A tuple containing:
- The number of packages that were repaired.
- A set of packages' PKs that were not repaired.
- The number of metadata files that were repaired.
- A set of packages' PKs without repaired metadata artifacts.
"""
immediate_content = (
content.filter(contentartifact__artifact__isnull=False)
Expand All @@ -87,6 +99,11 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
# Keep track of on-demand packages that were not repaired
pkgs_not_repaired = set()

# Metadata artifacts and content artifacts
metadata_batch = []
total_metadata_repaired = 0
pkgs_metadata_not_repaired = set()

progress_report = ProgressReport(
message="Repairing packages' metadata",
code="repair.metadata",
Expand All @@ -102,6 +119,14 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
.artifact
)
new_data = artifact_to_python_content_data(package.filename, main_artifact, domain)
total_metadata_repaired += update_metadata_artifact_if_needed(
package,
new_data.get("metadata_sha256"),
main_artifact,
domain,
metadata_batch,
pkgs_metadata_not_repaired,
)
total_repaired += update_package_if_needed(
package, new_data, batch, set_of_update_fields
)
Expand Down Expand Up @@ -163,7 +188,12 @@ def repair_metadata(content: QuerySet[PythonPackageContent]) -> tuple[int, set[s
total_repaired += len(batch)
PythonPackageContent.objects.bulk_update(batch, set_of_update_fields)

return total_repaired, pkgs_not_repaired
if metadata_batch:
not_repaired = _process_metadata_batch(metadata_batch)
pkgs_metadata_not_repaired.update(not_repaired)
total_metadata_repaired += len(metadata_batch) - len(not_repaired)

return total_repaired, pkgs_not_repaired, total_metadata_repaired, pkgs_metadata_not_repaired


def update_package_if_needed(
Expand Down Expand Up @@ -202,3 +232,95 @@ def update_package_if_needed(
set_of_update_fields.clear()

return total_repaired


def update_metadata_artifact_if_needed(
    package: PythonPackageContent,
    new_metadata_sha256: str | None,
    main_artifact: Artifact,
    domain: Domain,
    metadata_batch: list[tuple],
    pkgs_metadata_not_repaired: set[str],
) -> int:
    """
    Queue a metadata-artifact repair for ``package`` when one is needed.

    Only wheel files (``.whl``) with a known correct ``new_metadata_sha256`` are
    considered. A "create" operation is queued when the package has no
    ``.metadata`` ContentArtifact at all; an "update" operation is queued when
    the stored checksum disagrees with the freshly computed one and the current
    metadata artifact is missing or stale. Whenever the shared batch reaches
    BULK_SIZE it is processed immediately and cleared.

    Args:
        package: Package to inspect.
        new_metadata_sha256: Correct metadata_sha256 derived from the main
            artifact, or None when it could not be computed.
        main_artifact: The package's main artifact (source of the metadata).
        domain: Domain that new metadata artifacts will belong to.
        metadata_batch: Shared list of queued operations; mutated in place.
        pkgs_metadata_not_repaired: Shared set of failed package PKs; mutated
            in place.

    Returns:
        Number of metadata artifacts repaired by a flush triggered during this
        call (0 when no flush happened).
    """
    # Nothing to repair for non-wheels, or when the true checksum is unknown.
    if not new_metadata_sha256 or not package.filename.endswith(".whl"):
        return 0

    metadata_cas = package.contentartifact_set.filter(relative_path__endswith=".metadata")

    if not metadata_cas:
        # No ``.metadata`` ContentArtifact exists yet -- queue a creation.
        metadata_batch.append(("create", package, main_artifact, None, domain))
    elif new_metadata_sha256 != package.metadata_sha256:
        # Stored checksum is stale; re-point the existing ContentArtifact
        # unless its artifact already carries the correct checksum.
        existing_ca = metadata_cas.first()
        current_artifact = existing_ca.artifact
        if current_artifact is None or current_artifact.sha256 != new_metadata_sha256:
            metadata_batch.append(("update", package, main_artifact, existing_ca, domain))

    repaired_now = 0
    if len(metadata_batch) == BULK_SIZE:
        failed = _process_metadata_batch(metadata_batch)
        pkgs_metadata_not_repaired.update(failed)
        repaired_now = BULK_SIZE - len(failed)
        metadata_batch.clear()

    return repaired_now


def _process_metadata_batch(metadata_batch: list[tuple]) -> set[str]:
    """
    Execute a batch of queued metadata repair operations.

    For each queued tuple a metadata Artifact is generated from the package's
    main artifact and saved; the associated ContentArtifact is then either
    created ("create" action) or re-pointed ("update" action), in bulk.

    Args:
        metadata_batch: (action, package, main_artifact, content_artifact,
            domain) tuples, as queued by update_metadata_artifact_if_needed.

    Returns:
        Set of PKs of packages whose metadata artifact could not be generated.
    """
    failed_pks = set()
    new_content_artifacts = []
    changed_content_artifacts = []

    for action, package, main_artifact, content_artifact, domain in metadata_batch:
        metadata_artifact = artifact_to_metadata_artifact(package.filename, main_artifact)
        if not metadata_artifact:
            # Metadata could not be extracted from the main artifact.
            failed_pks.add(package.pk)
            continue
        metadata_artifact.pulp_domain = domain
        metadata_artifact.save()
        if action == "create":
            new_content_artifacts.append(
                ContentArtifact(
                    artifact=metadata_artifact,
                    content=package,
                    relative_path=f"{package.filename}.metadata",
                )
            )
        elif action == "update":
            content_artifact.artifact = metadata_artifact
            changed_content_artifacts.append(content_artifact)

    if new_content_artifacts:
        ContentArtifact.objects.bulk_create(new_content_artifacts)
    if changed_content_artifacts:
        ContentArtifact.objects.bulk_update(changed_content_artifacts, ["artifact"])

    return failed_pks
121 changes: 119 additions & 2 deletions pulp_python/tests/functional/api/test_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

@pytest.fixture
def create_content_direct(python_bindings):
def _create(artifact_filename, content_data):
def _create(artifact_filename, content_data, metadata_artifact_filename=None):
commands = (
"from pulpcore.plugin.models import Artifact, ContentArtifact; "
"from pulpcore.plugin.util import get_url; "
Expand All @@ -21,8 +21,16 @@ def _create(artifact_filename, content_data):
"c.save(); "
f"ca = ContentArtifact(artifact=a, content=c, relative_path=c.filename); "
"ca.save(); "
"print(get_url(c))"
)
if metadata_artifact_filename:
commands += (
f"a2 = Artifact.init_and_validate('{metadata_artifact_filename}'); "
"a2.save(); "
"ca2_filename = c.filename + '.metadata'; "
f"ca2 = ContentArtifact(artifact=a2, content=c, relative_path=ca2_filename); "
"ca2.save(); "
)
commands += "print(get_url(c))"
process = subprocess.run(["pulpcore-manager", "shell", "-c", commands], capture_output=True)

assert process.returncode == 0
Expand Down Expand Up @@ -214,3 +222,112 @@ def test_metadata_repair_endpoint(
assert new_content.author == author
assert new_content.packagetype == packagetype
assert new_content.requires_python == requires_python


def test_metadata_artifact_repair_endpoint(
    create_content_direct,
    delete_orphans_pre,
    download_python_file,
    monitor_task,
    move_to_repository,
    pulpcore_bindings,
    python_bindings,
    python_repo_factory,
):
    """
    Test repairing of PythonPackageContent's metadata_sha256 and its metadata Artifact
    and ContentArtifact via `Repositories.repair_metadata` endpoint.
    """
    # 1. Setup tested data
    python_repo = python_repo_factory()

    # missing metadata_sha256, missing metadata Artifact + ContentArtifact
    filename_1 = "scipy-1.1.0-cp27-none-win_amd64.whl"
    metadata_1 = None
    url_1 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_1)
    file_1 = download_python_file(filename_1, url_1)

    # correct metadata_sha256, missing metadata Artifact + ContentArtifact
    filename_2 = "scipy-1.1.0-cp27-cp27m-manylinux1_x86_64.whl"
    metadata_2 = "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6"
    url_2 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_2)
    file_2 = download_python_file(filename_2, url_2)

    # wrong metadata_sha256, missing metadata Artifact + ContentArtifact
    filename_3 = "scipy-1.1.0-cp34-none-win32.whl"
    metadata_3 = "1234"
    url_3 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_3)
    file_3 = download_python_file(filename_3, url_3)

    # wrong metadata_sha256, wrong metadata Artifact, correct metadata ContentArtifact
    # (the metadata file of package 1 is deliberately attached to package 4)
    filename_4 = "scipy-1.1.0-cp35-none-win32.whl"
    metadata_4 = "5678"
    url_4 = urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), filename_4)
    file_4 = download_python_file(filename_4, url_4)
    metadata_file_4 = download_python_file(
        f"{filename_1}.metadata",
        urljoin(urljoin(PYTHON_FIXTURES_URL, "packages/"), f"{filename_1}.metadata"),
    )

    # Build PythonPackageContent data
    filenames = [filename_1, filename_2, filename_3, filename_4]
    metadata_sha256s = [metadata_1, metadata_2, metadata_3, metadata_4]
    data_1, data_2, data_3, data_4 = [
        {"name": "scipy", "version": "1.1.0", "filename": f, "metadata_sha256": m}
        for f, m in zip(filenames, metadata_sha256s)
    ]

    # 2. Create content
    content_1 = create_content_direct(file_1, data_1)
    content_2 = create_content_direct(file_2, data_2)
    content_3 = create_content_direct(file_3, data_3)
    content_4 = create_content_direct(file_4, data_4, metadata_file_4)

    content_hrefs = {}
    for data, content in [
        (data_1, content_1),
        (data_2, content_2),
        (data_3, content_3),
        (data_4, content_4),
    ]:
        for field, test_value in data.items():
            assert getattr(content, field) == test_value
        content_hrefs[data["filename"]] = content.pulp_href
    move_to_repository(python_repo.pulp_href, list(content_hrefs.values()))

    # 3. Repair metadata and metadata files
    response = python_bindings.RepositoriesPythonApi.repair_metadata(python_repo.pulp_href)
    monitor_task(response.task)

    # 4. Check new metadata and metadata files
    main_artifact_hrefs = set()
    metadata_artifact_hrefs = set()
    new_data = [
        (filename_1, "15ae132303b2774a0d839d01c618cf99fc92716adfaaa2bc1267142ab2b76b98"),
        (filename_2, "7f303850d9be88fff27eaeb393c2fd3a6c1a130e21758b8294fc5bb2f38e02f6"),
        # filename_3 and filename_4 have the same metadata file
        (filename_3, "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364"),
        (filename_4, "747d24e500308067c4e5fd0e20fb2d4fd6595a3fb7b1d2ffa717217fb6a53364"),
    ]
    for filename, metadata_sha256 in new_data:
        content = pulpcore_bindings.ContentApi.list(pulp_href__in=[content_hrefs[filename]]).results
        assert content
        artifacts = content[0].artifacts
        assert len(artifacts) == 2

        main_artifact_href = artifacts.get(filename)
        main_artifact_hrefs.add(main_artifact_href)
        main_artifact = pulpcore_bindings.ArtifactsApi.read(main_artifact_href)

        # The metadata ContentArtifact is keyed by "<wheel filename>.metadata"
        # (was a broken placeholder f-string "(unknown).metadata", which made
        # artifacts.get() return None and ArtifactsApi.read() fail).
        metadata_artifact_href = artifacts.get(f"{filename}.metadata")
        metadata_artifact_hrefs.add(metadata_artifact_href)
        metadata_artifact = pulpcore_bindings.ArtifactsApi.read(metadata_artifact_href)

        pkg = python_bindings.ContentPackagesApi.read(content_hrefs[filename])
        assert pkg.metadata_sha256 == metadata_sha256
        assert main_artifact.sha256 == pkg.sha256
        assert metadata_artifact.sha256 == pkg.metadata_sha256

    # Check deduplication
    assert len(main_artifact_hrefs) == 4
    assert len(metadata_artifact_hrefs) == 3
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ underlines = ["", "", ""]

[tool.check-manifest]
ignore = [
"AGENTS.md",
"CHANGES/**",
"CLAUDE.md",
"dev_requirements.txt",
"doc_requirements.txt",
"docs/**",
Expand Down