diff --git a/internal/operator-controller/applier/boxcutter.go b/internal/operator-controller/applier/boxcutter.go
index fc22df818..f1f9bdd8d 100644
--- a/internal/operator-controller/applier/boxcutter.go
+++ b/internal/operator-controller/applier/boxcutter.go
@@ -12,6 +12,7 @@ import (
     "slices"
     "strings"
 
+    "github.com/go-logr/logr"
     "helm.sh/helm/v3/pkg/release"
     "helm.sh/helm/v3/pkg/storage/driver"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -153,8 +154,61 @@ func (r *SimpleRevisionGenerator) GenerateRevision(
     return r.buildClusterExtensionRevision(objs, ext, revisionAnnotations), nil
 }
 
+// removePodTemplateMetadata conditionally removes empty pod template annotations from Deployments.
+// Empty annotations are removed to prevent OLM from claiming ownership via Server-Side Apply,
+// allowing users to add custom annotations. Non-empty (bundle-provided) annotations are preserved.
+// Labels are always preserved as they are required for selector matching and chart functionality.
+func removePodTemplateMetadata(unstr *unstructured.Unstructured, l logr.Logger) {
+    if unstr.GetKind() != "Deployment" || unstr.GroupVersionKind().Group != "apps" {
+        return
+    }
+
+    obj := unstr.Object
+    spec, ok := obj["spec"].(map[string]any)
+    if !ok {
+        return
+    }
+
+    template, ok := spec["template"].(map[string]any)
+    if !ok {
+        return
+    }
+
+    templateMeta, ok := template["metadata"].(map[string]any)
+    if !ok {
+        return
+    }
+
+    // Remove pod template annotations only when the rendered manifest has no annotations.
+    // This allows users to add custom annotations (like kubectl rollout restart) without
+    // OLM overwriting them, while still preserving any chart-provided annotations when present.
+    if annotationsVal, hasAnnotations := templateMeta["annotations"]; hasAnnotations {
+        if annotationsMap, ok := annotationsVal.(map[string]any); ok {
+            if len(annotationsMap) == 0 {
+                delete(templateMeta, "annotations")
+                l.V(1).Info("removed empty pod template annotations from Deployment to preserve user changes",
+                    "deployment", unstr.GetName())
+            } else {
+                // Non-empty annotations are preserved so bundle/chart-provided annotations
+                // continue to be applied and maintained by OLM.
+                l.V(2).Info("preserving non-empty pod template annotations on Deployment",
+                    "deployment", unstr.GetName(), "count", len(annotationsMap))
+            }
+        }
+    }
+}
+
 // sanitizedUnstructured takes an unstructured obj, removes status if present, and returns a sanitized copy containing only the allowed metadata entries set below.
 // If any unallowed entries are removed, a warning will be logged.
+//
+// For Deployment objects, this function conditionally removes empty pod template annotations.
+// Bundle-provided annotations are preserved to maintain operator functionality.
+// Empty annotations are removed to allow users to add custom annotations without OLM reverting them.
+// Examples of user annotations: "kubectl rollout restart", "kubectl annotate", custom monitoring annotations.
+// Labels are kept because: (1) the deployment selector must match template labels, and
+// (2) chart-provided labels may be referenced by other resources.
+// This fixes the issue where user changes to pod template annotations would be undone by OLM.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
 func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured) {
     l := log.FromContext(ctx)
     obj := unstr.Object
@@ -193,6 +247,15 @@ func sanitizedUnstructured(ctx context.Context, unstr *unstructured.Unstructured
     l.Info("warning: extraneous values removed from manifest metadata", "allowed metadata", allowedMetadata)
     }
     obj["metadata"] = metadataSanitized
+
+    // For Deployment objects, conditionally remove empty pod template annotations (always keep labels).
+    // Empty annotations are removed to prevent OLM from claiming ownership.
+    // Non-empty (bundle-provided) annotations are preserved.
+    // This allows users to add custom annotations (kubectl rollout restart, kubectl annotate, etc.)
+    // without OLM overwriting them.
+    // Labels are always preserved because the deployment selector must match template labels,
+    // and chart-provided labels may be needed by other resources.
+    removePodTemplateMetadata(unstr, l)
 }
 
 func (r *SimpleRevisionGenerator) buildClusterExtensionRevision(
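Note (illustrative, not part of the patch): the sketch below spells out the pod template metadata shape removePodTemplateMetadata walks and the policy described in the comments above — labels always survive, an empty annotations map is dropped, and a populated (chart-provided) one is left alone. The map literals are assumed examples, not taken from a real bundle.

    package example

    // sanitizeExample shows an assumed before/after of a rendered Deployment's
    // spec.template.metadata, mirroring the policy of removePodTemplateMetadata.
    func sanitizeExample() (before, after map[string]any) {
        before = map[string]any{
            "labels":      map[string]any{"app": "test"}, // kept: must keep matching spec.selector
            "annotations": map[string]any{},              // empty: dropped so users can own the field
        }
        after = map[string]any{
            // The annotations key is gone; a non-empty, bundle-provided annotations
            // map would have been preserved unchanged.
            "labels": map[string]any{"app": "test"},
        }
        return before, after
    }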
diff --git a/internal/operator-controller/applier/boxcutter_test.go b/internal/operator-controller/applier/boxcutter_test.go
index 18f2758b4..3b2185823 100644
--- a/internal/operator-controller/applier/boxcutter_test.go
+++ b/internal/operator-controller/applier/boxcutter_test.go
@@ -1884,3 +1884,144 @@ func (s *statusWriterMock) Apply(ctx context.Context, obj runtime.ApplyConfigura
     // helps ensure tests fail if this assumption changes
     return fmt.Errorf("unexpected call to StatusWriter.Apply() - this method is not expected to be used in these tests")
 }
+
+func Test_SimpleRevisionGenerator_PodTemplateAnnotationSanitization(t *testing.T) {
+    tests := []struct {
+        name                        string
+        podTemplateAnnotations      map[string]string
+        expectAnnotationsInRevision bool
+    }{
+        {
+            name: "deployment with non-empty pod template annotations preserves them",
+            podTemplateAnnotations: map[string]string{
+                "kubectl.kubernetes.io/default-container": "main",
+                "prometheus.io/scrape":                    "true",
+            },
+            expectAnnotationsInRevision: true,
+        },
+        {
+            name:                        "deployment with empty pod template annotations removes them",
+            podTemplateAnnotations:      map[string]string{},
+            expectAnnotationsInRevision: false,
+        },
+        {
+            name:                        "deployment with nil pod template annotations has none in revision",
+            podTemplateAnnotations:      nil,
+            expectAnnotationsInRevision: false,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            // Create a deployment with the specified pod template annotations
+            deployment := &appsv1.Deployment{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-deployment",
+                },
+                Spec: appsv1.DeploymentSpec{
+                    Replicas: ptr.To(int32(1)),
+                    Selector: &metav1.LabelSelector{
+                        MatchLabels: map[string]string{"app": "test"},
+                    },
+                    Template: corev1.PodTemplateSpec{
+                        ObjectMeta: metav1.ObjectMeta{
+                            Labels:      map[string]string{"app": "test"},
+                            Annotations: tt.podTemplateAnnotations,
+                        },
+                        Spec: corev1.PodSpec{
+                            Containers: []corev1.Container{
+                                {
+                                    Name:  "main",
+                                    Image: "nginx:latest",
+                                },
+                            },
+                        },
+                    },
+                },
+            }
+
+            // Create revision generator with fake manifest provider
+            scheme := runtime.NewScheme()
+            require.NoError(t, k8scheme.AddToScheme(scheme))
+            require.NoError(t, ocv1.AddToScheme(scheme))
+
+            ext := &ocv1.ClusterExtension{
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test-extension",
+                },
+            }
+
+            manifestProvider := &FakeManifestProvider{
+                GetFn: func(_ fs.FS, _ *ocv1.ClusterExtension) ([]client.Object, error) {
+                    return []client.Object{deployment}, nil
+                },
+            }
+
+            rg := &applier.SimpleRevisionGenerator{
+                Scheme:           scheme,
+                ManifestProvider: manifestProvider,
+            }
+
+            // Create a valid bundle FS
+            bundleFS := bundlefs.Builder().
+                WithPackageName("test-package").
+                WithCSV(clusterserviceversion.Builder().WithName("test-csv").Build()).
+                Build()
+
+            // Generate revision
+            revision, err := rg.GenerateRevision(
+                context.Background(),
+                bundleFS,
+                ext,
+                map[string]string{"test": "label"},
+                nil,
+            )
+            require.NoError(t, err)
+            require.NotNil(t, revision)
+
+            // Find the deployment in the revision
+            var deploymentFound bool
+            for _, obj := range revision.Spec.Phases[0].Objects {
+                if obj.Object.GetKind() == "Deployment" {
+                    deploymentFound = true
+
+                    // Check pod template annotations
+                    spec, ok := obj.Object.Object["spec"].(map[string]any)
+                    require.True(t, ok, "deployment should have spec")
+
+                    template, ok := spec["template"].(map[string]any)
+                    require.True(t, ok, "deployment spec should have template")
+
+                    templateMeta, ok := template["metadata"].(map[string]any)
+                    require.True(t, ok, "pod template should have metadata")
+
+                    annotations, hasAnnotations := templateMeta["annotations"]
+                    if tt.expectAnnotationsInRevision {
+                        assert.True(t, hasAnnotations, "expected annotations to be present in revision")
+                        annotationsMap, ok := annotations.(map[string]any)
+                        require.True(t, ok, "annotations should be a map")
+
+                        for key, expectedValue := range tt.podTemplateAnnotations {
+                            actualValue, exists := annotationsMap[key]
+                            assert.True(t, exists, "expected annotation key %s to exist", key)
+                            assert.Equal(t, expectedValue, actualValue, "annotation value mismatch for key %s", key)
+                        }
+                        assert.Len(t, annotationsMap, len(tt.podTemplateAnnotations), "annotation count mismatch")
+                    } else {
+                        assert.False(t, hasAnnotations, "expected annotations to be removed from revision")
+                    }
+
+                    // Labels should always be preserved
+                    labels, hasLabels := templateMeta["labels"]
+                    assert.True(t, hasLabels, "labels should always be preserved")
+                    labelsMap, ok := labels.(map[string]any)
+                    require.True(t, ok, "labels should be a map")
+                    assert.Equal(t, "test", labelsMap["app"], "label app should be preserved")
+
+                    break
+                }
+            }
+            assert.True(t, deploymentFound, "deployment should be found in revision")
+        })
+    }
+}
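Note (not part of the patch): the new table-driven test should be runnable on its own with standard Go tooling, e.g. go test ./internal/operator-controller/applier/ -run Test_SimpleRevisionGenerator_PodTemplateAnnotationSanitization; any repository-specific make target for unit tests is not assumed here.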
diff --git a/test/e2e/features/rollout-restart.feature b/test/e2e/features/rollout-restart.feature
new file mode 100644
index 000000000..e451fe8d7
--- /dev/null
+++ b/test/e2e/features/rollout-restart.feature
@@ -0,0 +1,42 @@
+Feature: Rollout Restart User Changes
+  # Verifies that user-added pod template annotations persist after OLM reconciliation.
+  # Fixes: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+
+  Background:
+    Given OLM is available
+    And ClusterCatalog "test" serves bundles
+    And ServiceAccount "olm-sa" with needed permissions is available in ${TEST_NAMESPACE}
+
+  @BoxcutterRuntime
+  Scenario: User-initiated deployment changes persist after OLM reconciliation
+    When ClusterExtension is applied
+      """
+      apiVersion: olm.operatorframework.io/v1
+      kind: ClusterExtension
+      metadata:
+        name: ${NAME}
+      spec:
+        namespace: ${TEST_NAMESPACE}
+        serviceAccount:
+          name: olm-sa
+        source:
+          sourceType: Catalog
+          catalog:
+            packageName: test
+            selector:
+              matchLabels:
+                "olm.operatorframework.io/metadata.name": test-catalog
+      """
+    Then ClusterExtension is rolled out
+    And ClusterExtension is available
+    And resource "deployment/test-operator" is installed
+    And deployment "test-operator" is ready
+    When user performs rollout restart on "deployment/test-operator"
+    Then deployment "test-operator" rollout completes successfully
+    And resource "deployment/test-operator" has restart annotation
+    # Wait for OLM reconciliation cycle (runs every 10 seconds)
+    And I wait for "30" seconds
+    # Verify user changes persisted after OLM reconciliation
+    Then deployment "test-operator" rollout is still successful
+    And resource "deployment/test-operator" has restart annotation
+    And deployment "test-operator" has expected number of ready replicas
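Note (illustrative, not part of the patch): kubectl rollout restart works by patching spec.template.metadata.annotations with a kubectl.kubernetes.io/restartedAt timestamp, which is exactly the user-owned field this scenario checks for. A rough client-go equivalent of the restart step is sketched below; the clientset, namespace, and name parameters are assumptions, not helpers from this repository.

    package example

    import (
        "context"
        "fmt"
        "time"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/apimachinery/pkg/types"
        "k8s.io/client-go/kubernetes"
    )

    // restartDeployment sets the same annotation that `kubectl rollout restart` sets,
    // triggering a rolling restart of the named Deployment.
    func restartDeployment(ctx context.Context, cs kubernetes.Interface, namespace, name string) error {
        patch := []byte(fmt.Sprintf(
            `{"spec":{"template":{"metadata":{"annotations":{"kubectl.kubernetes.io/restartedAt":%q}}}}}`,
            time.Now().Format(time.RFC3339)))
        _, err := cs.AppsV1().Deployments(namespace).Patch(
            ctx, name, types.StrategicMergePatchType, patch, metav1.PatchOptions{})
        return err
    }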
diff --git a/test/e2e/steps/steps.go b/test/e2e/steps/steps.go
index 483514e40..482cf1df3 100644
--- a/test/e2e/steps/steps.go
+++ b/test/e2e/steps/steps.go
@@ -14,6 +14,7 @@ import (
     "os/exec"
     "path/filepath"
     "reflect"
+    "strconv"
     "strings"
     "time"
 
@@ -87,6 +88,13 @@ func RegisterSteps(sc *godog.ScenarioContext) {
     sc.Step(`^(?i)resource apply fails with error msg containing "([^"]+)"$`, ResourceApplyFails)
     sc.Step(`^(?i)resource "([^"]+)" is eventually restored$`, ResourceRestored)
     sc.Step(`^(?i)resource "([^"]+)" matches$`, ResourceMatches)
+    sc.Step(`^(?i)user performs rollout restart on "([^"]+)"$`, UserPerformsRolloutRestart)
+    sc.Step(`^(?i)resource "([^"]+)" has restart annotation$`, ResourceHasRestartAnnotation)
+    sc.Step(`^(?i)deployment "([^"]+)" is ready$`, DeploymentIsReady)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout completes successfully$`, DeploymentRolloutCompletesSuccessfully)
+    sc.Step(`^(?i)I wait for "([^"]+)" seconds$`, WaitForSeconds)
+    sc.Step(`^(?i)deployment "([^"]+)" rollout is still successful$`, DeploymentRolloutIsStillSuccessful)
+    sc.Step(`^(?i)deployment "([^"]+)" has expected number of ready replicas$`, DeploymentHasExpectedReadyReplicas)
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in test namespace$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
     sc.Step(`^(?i)ServiceAccount "([^"]*)" with needed permissions is available in \${TEST_NAMESPACE}$`, ServiceAccountWithNeededPermissionsIsAvailableInNamespace)
@@ -1168,3 +1176,265 @@ func latestActiveRevisionForExtension(extName string) (*ocv1.ClusterExtensionRev
 
     return latest, nil
 }
+
+// UserPerformsRolloutRestart simulates a user running "kubectl rollout restart deployment/<name>".
+// This adds a restart annotation to trigger a rolling restart of the pods.
+// It exercises the generic fix: OLM should not undo any user-added pod template annotations.
+// In OLMv0 this change would be reverted; in OLMv1 it should persist because kubectl owns the field.
+// See: https://github.com/operator-framework/operator-lifecycle-manager/issues/3392
+func UserPerformsRolloutRestart(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation, got: %s", kind)
+    }
+
+    // Run kubectl rollout restart to add the restart annotation.
+    // This is the real command users run, so we test actual user behavior.
+    out, err := k8sClient("rollout", "restart", resourceName, "-n", sc.namespace)
+    if err != nil {
+        return fmt.Errorf("failed to rollout restart %s: %w", resourceName, err)
+    }
+
+    logger.V(1).Info("Rollout restart initiated", "deployment", deploymentName, "output", out)
+
+    return nil
+}
+
+// ResourceHasRestartAnnotation checks that a deployment has a restart annotation.
+// This confirms that user changes stay in place after OLM reconciles.
+// The fix is generic and applies to any user-added annotations on pod templates.
+func ResourceHasRestartAnnotation(ctx context.Context, resourceName string) error {
+    sc := scenarioCtx(ctx)
+    resourceName = substituteScenarioVars(resourceName, sc)
+
+    kind, deploymentName, ok := strings.Cut(resourceName, "/")
+    if !ok {
+        return fmt.Errorf("invalid resource name format: %s (expected kind/name)", resourceName)
+    }
+
+    if kind != "deployment" {
+        return fmt.Errorf("only deployment resources are supported for restart annotation check, got: %s", kind)
+    }
+
+    // Look for the restart annotation added by "kubectl rollout restart"
+    restartAnnotationKey := "kubectl.kubernetes.io/restartedAt"
+
+    // Keep checking until the annotation appears (it may not show up immediately),
+    // rather than checking only once.
+    var annotationValue string
+    waitFor(ctx, func() bool {
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", fmt.Sprintf("jsonpath={.spec.template.metadata.annotations['%s']}", restartAnnotationKey))
+        if err != nil {
+            return false
+        }
+        // If the annotation exists and has a value, it stayed in place
+        if out == "" {
+            return false
+        }
+        annotationValue = out
+        return true
+    })
+
+    logger.V(1).Info("Restart annotation found", "deployment", deploymentName, "restartedAt", annotationValue)
+    return nil
+}
+
+// DeploymentIsReady checks that a deployment is ready with all pods running.
+func DeploymentIsReady(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    waitFor(ctx, func() bool {
+        // Check if the deployment reports Available=True
+        out, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+            "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+        if err != nil {
+            return false
+        }
+        return out == "True"
+    })
+
+    logger.V(1).Info("Deployment is ready", "deployment", deploymentName)
+    return nil
+}
+
+// DeploymentRolloutCompletesSuccessfully waits for the rollout to finish.
+// It verifies that new pods were created and are running.
+func DeploymentRolloutCompletesSuccessfully(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Use kubectl rollout status to wait until the rollout is done, i.e. new pods are created and running.
+    // The timeout is 7m to account for startup probes (test-operator has a 5-minute startup probe).
+    out, err := k8sClient("rollout", "status", "deployment/"+deploymentName, "-n", sc.namespace, "--timeout=7m")
+    if err != nil {
+        return fmt.Errorf("deployment rollout failed: %w, output: %s", err, out)
+    }
+
+    logger.V(1).Info("Deployment rollout completed", "deployment", deploymentName, "status", out)
+
+    // Check deployment status
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is not available", deploymentName)
+    }
+
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is not progressing correctly", deploymentName)
+    }
+
+    return nil
+}
+
+// WaitForSeconds waits for the given number of seconds.
+// This gives OLM time to reconcile between test steps.
+//
+// Note: we wait for a fixed duration instead of polling because the point is to verify
+// that user changes survive a reconciliation (OLM reconciles roughly every 10 seconds);
+// polling for a condition would not tell us whether OLM has actually run yet.
+func WaitForSeconds(ctx context.Context, seconds string) error {
+    sec, err := strconv.Atoi(seconds)
+    if err != nil {
+        return fmt.Errorf("invalid seconds value %s: %w", seconds, err)
+    }
+
+    if sec <= 0 {
+        return fmt.Errorf("seconds value must be greater than 0, got %d", sec)
+    }
+
+    logger.V(1).Info("Waiting for reconciliation", "seconds", sec)
+
+    // Use select so the wait can be cancelled via the context
+    dur := time.Duration(sec) * time.Second
+    select {
+    case <-time.After(dur):
+        logger.V(1).Info("Wait complete")
+        return nil
+    case <-ctx.Done():
+        return fmt.Errorf("wait canceled: %w", ctx.Err())
+    }
+}
+
+// verifyDeploymentReplicaStatus checks whether a deployment has the expected number of ready replicas.
+func verifyDeploymentReplicaStatus(deploymentName, namespace string) (string, string, error) {
+    readyReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.status.readyReplicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get ready replicas: %w", err)
+    }
+
+    replicas, err := k8sClient("get", "deployment", deploymentName, "-n", namespace,
+        "-o", "jsonpath={.spec.replicas}")
+    if err != nil {
+        return "", "", fmt.Errorf("failed to get desired replicas: %w", err)
+    }
+
+    // Normalize empty jsonpath results (readyReplicas is omitted when there are 0 ready replicas)
+    readyReplicas = strings.TrimSpace(readyReplicas)
+    replicas = strings.TrimSpace(replicas)
+    if readyReplicas == "" {
+        readyReplicas = "0"
+    }
+    if replicas == "" {
+        replicas = "0"
+    }
+
+    // Compare as integers to avoid false negatives
+    readyInt, err := strconv.Atoi(readyReplicas)
+    if err != nil {
+        return readyReplicas, replicas, fmt.Errorf("invalid ready replicas value %q: %w", readyReplicas, err)
+    }
+    replicasInt, err := strconv.Atoi(replicas)
+    if err != nil {
+        return readyReplicas, replicas, fmt.Errorf("invalid desired replicas value %q: %w", replicas, err)
+    }
+
+    if readyInt != replicasInt {
+        return readyReplicas, replicas, fmt.Errorf("deployment %s has %d ready replicas but expected %d",
+            deploymentName, readyInt, replicasInt)
+    }
+
+    return readyReplicas, replicas, nil
+}
+
+// DeploymentRolloutIsStillSuccessful checks that the rollout is still in a good state,
+// i.e. that OLM did not undo the user's "kubectl rollout restart" and the new pods are still running.
+func DeploymentRolloutIsStillSuccessful(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Check deployment status
+    available, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Available')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment availability: %w", err)
+    }
+    if available != "True" {
+        return fmt.Errorf("deployment %s is not available - rollout was undone", deploymentName)
+    }
+
+    // Check that the deployment is still progressing correctly
+    progressing, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.conditions[?(@.type=='Progressing')].status}")
+    if err != nil {
+        return fmt.Errorf("failed to check deployment progressing: %w", err)
+    }
+    if progressing != "True" {
+        return fmt.Errorf("deployment %s is not working - rollout may have been undone", deploymentName)
+    }
+
+    // Check that the expected number of replicas are ready (rollout finished and was not interrupted)
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return fmt.Errorf("%w - rollout may have been undone", err)
+    }
+
+    logger.V(1).Info("Deployment rollout is still successful", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
+
+// DeploymentHasExpectedReadyReplicas checks that the deployment has the expected number of ready replicas.
+// This confirms the rollout finished successfully and the pods are running.
+func DeploymentHasExpectedReadyReplicas(ctx context.Context, deploymentName string) error {
+    sc := scenarioCtx(ctx)
+    deploymentName = substituteScenarioVars(deploymentName, sc)
+
+    // Check that the expected number of replicas are ready
+    readyReplicas, replicas, err := verifyDeploymentReplicaStatus(deploymentName, sc.namespace)
+    if err != nil {
+        return err
+    }
+
+    // Also check that there are no unavailable replicas
+    unavailableReplicas, err := k8sClient("get", "deployment", deploymentName, "-n", sc.namespace,
+        "-o", "jsonpath={.status.unavailableReplicas}")
+    if err == nil && unavailableReplicas != "" && unavailableReplicas != "0" {
+        return fmt.Errorf("deployment %s has %s unavailable replicas", deploymentName, unavailableReplicas)
+    }
+
+    logger.V(1).Info("Deployment has expected ready replicas", "deployment", deploymentName,
+        "readyReplicas", readyReplicas, "desiredReplicas", replicas)
+
+    return nil
+}
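Note (illustrative, not part of the patch): ResourceHasRestartAnnotation reads the annotation back through kubectl's jsonpath, which requires careful quoting for dotted keys. An alternative that side-steps the quoting entirely is to read the Deployment with client-go, as sketched below; the clientset and parameter names are assumptions, not helpers from this repository.

    package example

    import (
        "context"
        "fmt"

        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
        "k8s.io/client-go/kubernetes"
    )

    // hasRestartAnnotation reports whether the Deployment's pod template carries the
    // annotation that `kubectl rollout restart` sets.
    func hasRestartAnnotation(ctx context.Context, cs kubernetes.Interface, namespace, name string) (bool, error) {
        dep, err := cs.AppsV1().Deployments(namespace).Get(ctx, name, metav1.GetOptions{})
        if err != nil {
            return false, fmt.Errorf("failed to get deployment %s/%s: %w", namespace, name, err)
        }
        ts, ok := dep.Spec.Template.ObjectMeta.Annotations["kubectl.kubernetes.io/restartedAt"]
        return ok && ts != "", nil
    }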