diff --git a/metrics-collector/.ceignore b/metrics-collector/.ceignore new file mode 100644 index 000000000..a5ed506b1 --- /dev/null +++ b/metrics-collector/.ceignore @@ -0,0 +1,2 @@ +images/ +setup/ \ No newline at end of file diff --git a/metrics-collector/Dockerfile b/metrics-collector/Dockerfile index 111dd9481..c47bf3e6d 100644 --- a/metrics-collector/Dockerfile +++ b/metrics-collector/Dockerfile @@ -1,11 +1,51 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +# Stage 1: Build Go binary +FROM quay.io/projectquay/golang:1.25 AS go-builder WORKDIR /go/src/app -COPY . . - +COPY go.mod go.sum ./ RUN go mod download -RUN CGO_ENABLED=0 go build -o /go/bin/app main.go +COPY main.go ./ +RUN CGO_ENABLED=0 GOOS=linux go build -a -installsuffix cgo -o app main.go + +# Stage 2: Download and extract Prometheus +FROM busybox:1.36-glibc AS prometheus-downloader +ARG PROMETHEUS_VERSION=3.9.1 +ARG TARGETARCH=amd64 + +WORKDIR /tmp +RUN wget https://github.com/prometheus/prometheus/releases/download/v${PROMETHEUS_VERSION}/prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + tar xzf prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}.tar.gz && \ + mv prometheus-${PROMETHEUS_VERSION}.linux-${TARGETARCH}/prometheus /prometheus + +# Stage 3: Get CA certificates (tag pinned so rebuilds are reproducible) +FROM alpine:3.22 AS certs +RUN apk --no-cache add ca-certificates + +# Stage 4: Runtime image +FROM busybox:1.36-glibc + +# Copy CA certificates for TLS verification +COPY --from=certs /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ca-certificates.crt + +# Copy Go binary +COPY --from=go-builder /go/src/app/app /app + +# Copy Prometheus binary +COPY --from=prometheus-downloader /prometheus /bin/prometheus + +# Copy configuration and scripts +COPY prometheus.yml.template /etc/prometheus/prometheus.yml.template +COPY start.sh /start.sh +RUN chmod +x /start.sh + +# Create necessary directories with proper permissions +RUN mkdir -p /tmp/agent-data && \ + mkdir -p /etc/secrets && \ + chmod 777 
/tmp/agent-data + +# Set SSL certificate path environment variable +ENV SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt + +# Use non-root user +USER 1000:1000 -# Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 -COPY --from=build-env /go/bin/app / -CMD ["/app"] +ENTRYPOINT ["/start.sh"] diff --git a/metrics-collector/README.md b/metrics-collector/README.md index 9abb6844a..d531cf74f 100644 --- a/metrics-collector/README.md +++ b/metrics-collector/README.md @@ -6,7 +6,7 @@ Code Engine job that demonstrates how to collect resource metrics (CPU, memory a ## Installation -### Capture metrics every n seconds +## Capture metrics every n seconds * Create Code Engine job template ``` @@ -27,33 +27,136 @@ $ ibmcloud ce jobrun submit \ ``` -### Capture metrics every n minutes +## Send metrics to IBM Cloud Monitoring -* Create Code Engine job template +When `METRICS_ENABLED=true`, the metrics collector runs an embedded Prometheus agent that scrapes metrics from the local `/metrics` endpoint and forwards them to IBM Cloud Monitoring. + +![](./images/monitoring-dashboard-ce-component-resources.png) + +### Prerequisites + +1. **IBM Cloud Monitoring Instance**: You need an IBM Cloud Monitoring instance with an API key +2. 
**Code Engine project**: The collector must run in a Code Engine project + +### Setup Instructions + +**Step 1: Create a secret with your IBM Cloud Monitoring API key** +```bash +ibmcloud ce secret create --name monitoring-apikey --from-literal monitoring-apikey=<YOUR_MONITORING_API_KEY> ``` -$ ibmcloud ce job create \ + +**Step 2: Determine your IBM Cloud Monitoring ingestion endpoint** + +The `METRICS_REMOTE_WRITE_FQDN` depends on your IBM Cloud Monitoring instance region, for example: +- **US South (Dallas)**: `ingest.prws.us-south.monitoring.cloud.ibm.com` +- **US East (Washington DC)**: `ingest.prws.us-east.monitoring.cloud.ibm.com` +- **EU Central (Frankfurt)**: `ingest.prws.eu-de.monitoring.cloud.ibm.com` +- **EU GB (London)**: `ingest.prws.eu-gb.monitoring.cloud.ibm.com` +- **JP Tokyo**: `ingest.prws.jp-tok.monitoring.cloud.ibm.com` +- **AU Sydney**: `ingest.prws.au-syd.monitoring.cloud.ibm.com` + +**Step 3: Create the job with the required configuration** +```bash +ibmcloud ce job create \ --name metrics-collector \ - --src . \ - --mode task \ + --src "." \ + --mode daemon \ --cpu 0.25 \ --memory 0.5G \ - --wait + --build-size xlarge \ + --env INTERVAL=30 \ + --env METRICS_ENABLED=true \ + --env METRICS_REMOTE_WRITE_FQDN=ingest.prws.eu-es.monitoring.cloud.ibm.com \ + --mount-secret /etc/secrets=monitoring-apikey ``` -* Submit a Code Engine cron subscription that triggers the metrics collector every minute to query the Metrics API -``` -$ ibmcloud ce subscription cron create \ - --name collect-metrics-every-minute \ - --destination-type job \ - --destination metrics-collector \ - --schedule '*/1 * * * *' +**Step 4: Submit a job run** +```bash +ibmcloud ce jobrun submit \ + --job metrics-collector ``` -## Configuration +**Step 5: Set up the Cloud Monitoring dashboard as described [here](setup/ibm-cloud-monitoring/README.md)** + +### How It Works + +1. The metrics collector exposes Prometheus metrics on `localhost:9100/metrics` +2. 
The embedded Prometheus agent scrapes these metrics every 15 seconds +3. The agent also discovers and scrapes pods with the `codeengine.cloud.ibm.com/userMetricsScrape: 'true'` annotation +4. All metrics are forwarded to IBM Cloud Monitoring via remote write +5. If either the collector or Prometheus agent crashes, the container exits with a non-zero code to trigger a restart + +### Required Environment Variables for Prometheus Integration + +- **`METRICS_ENABLED=true`**: Enables the Prometheus agent +- **`METRICS_REMOTE_WRITE_FQDN`**: IBM Cloud Monitoring ingestion endpoint FQDN (required when `METRICS_ENABLED=true`) +- **Secret Mount**: `/etc/secrets/monitoring-apikey` must contain your IBM Cloud Monitoring API key + +### Troubleshooting + +If the container fails to start with `METRICS_ENABLED=true`, check the logs for: +- Missing `/etc/secrets/monitoring-apikey` file +- Missing `METRICS_REMOTE_WRITE_FQDN` environment variable + +### Configuration + +Per default the metrics collector collects memory and CPU statistics, like `usage`, `current` and `configured`. + +#### Environment Variables + +- **`INTERVAL`** (default: `30`): Collection interval in seconds (minimum 30 seconds). Controls how frequently metrics are collected in daemon mode. +- **`COLLECT_DISKUSAGE`** (default: `false`): Set to `true` to collect disk space usage. Note: The metrics collector calculates the overall file size stored in the pod's filesystem, which includes files from the container image, ephemeral storage, and mounted COS buckets. This metric cannot be used to calculate ephemeral storage usage alone. +- **`METRICS_ENABLED`** (default: `false`): Set to `true` to enable the HTTP metrics server. When disabled, the collector still runs and logs metrics to stdout but does not expose the HTTP endpoint. +- **`METRICS_PORT`** (default: `9100`): HTTP server port for the Prometheus metrics endpoint. Only used when `METRICS_ENABLED=true` in daemon mode. 
-Per default the metrics collector collects memory and CPU statistics, like `usage`, `current` and `configured`. +### Prometheus Metrics Endpoint -One can use the environment variable `COLLECT_DISKUSAGE=true` to also collect the amount of disk space that is used. Please note, the metrics collector can only calculate the overall file size stored in the pods filesystem which includes files that are part of the container image, the epheremal storage as well as mounted COS buckets. Hence, this metric cannot be used to calculate the ephemeral storage usage. +When running in **daemon mode** with **`METRICS_ENABLED=true`**, the metrics collector exposes an HTTP server on port 9100 (configurable via `METRICS_PORT`) with a `/metrics` endpoint that provides Prometheus-compatible metrics. + +**Note**: The HTTP server is only started when `METRICS_ENABLED=true`. When disabled, the collector continues to run and log metrics to stdout in JSON format, but does not expose the HTTP endpoint. + +#### Accessing the Metrics Endpoint + +The metrics endpoint is available at `http://<instance-ip>:9100/metrics` and can be scraped by Prometheus or accessed directly. 
+ +#### Exposed Metrics + +The following Prometheus metrics are exposed as gauges: + +Container Metrics: +- **`ibm_codeengine_instance_cpu_usage_millicores`**: Current CPU usage in millicores +- **`ibm_codeengine_instance_cpu_limit_millicores`**: Configured CPU limit in millicores +- **`ibm_codeengine_instance_memory_usage_bytes`**: Current memory usage in bytes +- **`ibm_codeengine_instance_memory_limit_bytes`**: Configured memory limit in bytes +- **`ibm_codeengine_instance_ephemeral_storage_usage_bytes`**: Current ephemeral storage usage in bytes (if `COLLECT_DISKUSAGE=true`) + +The following 3 metrics are used to monitor the collector itself: +- **`codeengine_collector_collection_duration_seconds`**: Time taken to collect metrics in seconds (if `METRICS_INTERNAL_STATS=true`) +- **`codeengine_collector_last_collection_timestamp_seconds`**: Unix timestamp of last successful collection (if `METRICS_INTERNAL_STATS=true`) +- **`codeengine_collector_collection_errors_total`**: Total number of collection errors (counter) (if `METRICS_INTERNAL_STATS=true`) + +#### Metric Labels + +All container metrics include the following labels: +- `instance_name`: Name of the pod instance +- `component_type`: Type of component (`app`, `job`, or `build`) +- `component_name`: Name of the Code Engine component + +#### Example Metrics Output + +```prometheus +# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores +# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge +ibm_codeengine_instance_cpu_usage_millicores{instance_name="myapp-00001-deployment-abc123",component_type="app",component_name="myapp"} 250 + +# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes +# TYPE ibm_codeengine_instance_memory_usage_bytes gauge +ibm_codeengine_instance_memory_usage_bytes{instance_name="myapp-00001-deployment-abc123",component_type="app",component_name="myapp"} 134000000 +``` + +#### Prometheus Scrape Configuration + +**Note**: The HTTP 
server is only started when `METRICS_ENABLED=true` and running in daemon mode (`JOB_MODE != "task"`). In task mode, metrics are collected once and logged to stdout without starting the HTTP server. When `METRICS_ENABLED` is not set to `true`, the collector runs in daemon mode but only logs metrics to stdout without exposing the HTTP endpoint. ## IBM Cloud Logs setup @@ -71,7 +174,7 @@ Follow the steps below to create a custom dashboard in your IBM Cloud Logs insta ![New dashboard](./images/icl-dashboard-new.png) -* In the "Import" modal, select the file [./setup/dashboard-code_engine_resource_consumption_metrics.json](./setup/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" +* In the "Import" modal, select the file [./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json](./setup/ibm-cloud-logs/dashboard-code_engine_resource_consumption_metrics.json) located in this repository, and click "Import" ![Import modal](./images/icl-dashboard-import.png) @@ -131,22 +234,6 @@ app:"codeengine" AND message.metric:"instance-resources" ![Logs overview](./images/icl-logs-view-overview.png) - -## IBM Log Analysis setup (deprecated) - -### Log lines - -Along with a human readable message, like `Captured metrics of app instance 'load-generator-00001-deployment-677d5b7754-ktcf6': 3m vCPU, 109 MB memory, 50 MB ephemeral storage`, each log line passes specific resource utilization details in a structured way allowing to apply advanced filters on them. - -E.g. -- `cpu.usage:>80`: Filter for all log lines that noticed a CPU utilization of 80% or higher -- `memory.current:>1000`: Filter for all log lines that noticed an instance that used 1GB or higher of memory -- `component_type:app`: Filter only for app instances. 
Possible values are `app`, `job`, and `build` -- `component_name:`: Filter for all instances of a specific app, job, or build -- `name:`: Filter for a specific instance - -![IBM Cloud Logs](./images/ibm-cloud-logs--loglines.png) - ### Log graphs Best is to create IBM Cloud Logs Board, in order to visualize the CPU and Memory usage per Code Engine component. @@ -170,14 +257,11 @@ Best is to create IBM Cloud Logs Board, in order to visualize the CPU and Memory - The resulting graph will render the actual CPU usage compared to the configured limit. The the unit is milli vCPUs (1000 -> 1 vCPU). ![](./images/cpu-utilization.png) - #### Add memory utilization 1. Duplicate the graph, change its name to Memory and replace its plots with `memory.configured` and `memory.current`. 1. The resulting graph will render the actual memory usage compared to the configured limit. The the unit is MB (1000 -> 1 GB). ![](./images/memory-utilization.png) - - #### Add disk utilization 1. Duplicate the graph or create a new one, change its name to "Disk usage" and replace its plots with `disk_usage.current`. 1. The resulting graph will render the actual disk usage. While this does not allow to identify the usage of disk space compared with the configured ephemeral storage limit, this graph gives an impression on whether the disk usage is growing over time. The the unit is MB (1000 -> 1 GB). 
diff --git a/metrics-collector/go.mod b/metrics-collector/go.mod index b816d4e37..e358afe92 100644 --- a/metrics-collector/go.mod +++ b/metrics-collector/go.mod @@ -1,6 +1,6 @@ module metrics-collector -go 1.23.0 +go 1.25.0 require ( k8s.io/api v0.30.1 @@ -31,7 +31,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect - golang.org/x/net v0.38.0 // indirect + golang.org/x/net v0.38.0 // indirect golang.org/x/oauth2 v0.27.0 // indirect golang.org/x/sys v0.31.0 // indirect golang.org/x/term v0.30.0 // indirect @@ -42,7 +42,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.120.1 // indirect - k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f // indirect + k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 // indirect k8s.io/utils v0.0.0-20240502163921-fe8a2dddb1d0 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect diff --git a/metrics-collector/go.sum b/metrics-collector/go.sum index 6f6389a87..f2758c0dc 100644 --- a/metrics-collector/go.sum +++ b/metrics-collector/go.sum @@ -14,8 +14,7 @@ github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDsl github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= -github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= -github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= 
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= @@ -28,8 +27,8 @@ github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeN github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= -github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6 h1:k7nVchz72niMH6YLQNvHSdIE7iqsQxK1P41mySCvssg= -github.com/google/pprof v0.0.0-20240424215950-a892ee059fd6/go.mod h1:kf6iHlnVGwgKolg33glAes7Yg/8iWP8ukqeldJSO7jw= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= +github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= @@ -58,10 +57,10 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= -github.com/onsi/ginkgo/v2 v2.17.2 h1:7eMhcy3GimbsA3hEnVKdw/PQM9XN9krpKVXsZdph0/g= -github.com/onsi/ginkgo/v2 v2.17.2/go.mod h1:nP2DPOQoNsQmsVyv5rDA8JkXQoCs6goXIvr/PRJ1eCc= -github.com/onsi/gomega v1.33.1 
h1:dsYjIxxSR755MDmKVsaFQTE22ChNBcuuTWgkUDSubOk= -github.com/onsi/gomega v1.33.1/go.mod h1:U4R44UsT+9eLIaYRB2a5qajjtQYn0hauxvRm16AVYg0= +github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY= +github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM= +github.com/onsi/gomega v1.31.0 h1:54UJxxj6cPInHS3a35wm6BK/F9nHYueZ1NVujHDrnXE= +github.com/onsi/gomega v1.31.0/go.mod h1:DW9aCi7U6Yi40wNVAvT6kzFnEVEI5n3DloYBiKiT6zk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= @@ -83,8 +82,8 @@ golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= -golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= -golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= +golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= +golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -133,8 +132,8 @@ k8s.io/client-go v0.30.1 h1:uC/Ir6A3R46wdkgCV3vbLyNOYyCJ8oZnjtJGKfytl/Q= k8s.io/client-go v0.30.1/go.mod h1:wrAqLNs2trwiCH/wxxmT/x3hKVH9PuV0GGW0oDoHVqc= k8s.io/klog/v2 v2.120.1 h1:QXU6cPEOIslTGvZaXvFWiP9VKyeet3sawzTOvdXb4Vw= k8s.io/klog/v2 
v2.120.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f h1:0LQagt0gDpKqvIkAMPaRGcXawNMouPECM1+F9BVxEaM= -k8s.io/kube-openapi v0.0.0-20240430033511-f0e62f92d13f/go.mod h1:S9tOR0FxgyusSNR+MboCuiDpVWkAifZvaYI1Q2ubgro= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340 h1:BZqlfIlq5YbRMFko6/PM7FjZpUb45WallggurYhKGag= +k8s.io/kube-openapi v0.0.0-20240228011516-70dd3763d340/go.mod h1:yD4MZYeKMBwQKVht279WycxKyM84kkAx2DPrTXaeb98= k8s.io/kubectl v0.30.1 h1:sHFIRI3oP0FFZmBAVEE8ErjnTyXDPkBcvO88mH9RjuY= k8s.io/kubectl v0.30.1/go.mod h1:7j+L0Cc38RYEcx+WH3y44jRBe1Q1jxdGPKkX0h4iDq0= k8s.io/metrics v0.30.1 h1:PeA9cP0kxVtaC8Wkzp4sTkr7YSkd9R0UYP6cCHOOY1M= diff --git a/metrics-collector/images/monitoring-dashboard-ce-component-resources.png b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png new file mode 100644 index 000000000..c6b2a282f Binary files /dev/null and b/metrics-collector/images/monitoring-dashboard-ce-component-resources.png differ diff --git a/metrics-collector/main.go b/metrics-collector/main.go index 63a2ba867..27ca1e072 100644 --- a/metrics-collector/main.go +++ b/metrics-collector/main.go @@ -5,10 +5,14 @@ import ( "context" "encoding/json" "fmt" + "net/http" "os" + "os/signal" "strconv" "strings" "sync" + "sync/atomic" + "syscall" "time" v1 "k8s.io/api/core/v1" @@ -23,13 +27,180 @@ import ( metricsv "k8s.io/metrics/pkg/client/clientset/versioned" ) -func main() { +// MetricsCache holds the latest collected metrics in a thread-safe manner +type MetricsCache struct { + mu sync.RWMutex + metrics []InstanceResourceStats + namespace string + lastUpdate time.Time + collectionCount int64 + errorCount int64 +} + +// CollectorStats tracks collector performance metrics +type CollectorStats struct { + lastCollectionDuration atomic.Int64 // in milliseconds + lastCollectionTime atomic.Int64 // unix timestamp + totalErrors atomic.Int64 +} + +var ( + metricsCache = &MetricsCache{} + 
collectorStats = &CollectorStats{} +) + +// setupHTTPHandlers configures the HTTP routes +func setupHTTPHandlers() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/metrics", metricsHandler) + return mux +} + +// metricsHandler serves Prometheus-formatted metrics +func metricsHandler(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "Method not allowed", http.StatusMethodNotAllowed) + return + } + + metricsCache.mu.RLock() + metrics := metricsCache.metrics + namespace := metricsCache.namespace + lastUpdate := metricsCache.lastUpdate + metricsCache.mu.RUnlock() + + // Set content type for Prometheus + w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + + // Write Prometheus metrics + output := formatPrometheusMetrics(metrics, namespace, lastUpdate) + w.Write([]byte(output)) +} + +// formatPrometheusMetrics converts metrics to Prometheus text format +func formatPrometheusMetrics(metrics []InstanceResourceStats, namespace string, lastUpdate time.Time) string { + var sb strings.Builder + + // Helper function to escape label values + escapeLabelValue := func(s string) string { + s = strings.ReplaceAll(s, "\\", "\\\\") + s = strings.ReplaceAll(s, "\"", "\\\"") + s = strings.ReplaceAll(s, "\n", "\\n") + return s + } + + // Write container CPU usage metrics + sb.WriteString("# HELP ibm_codeengine_instance_cpu_usage_millicores Current CPU usage in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_usage_millicores gauge\n") + for _, m := range metrics { + labels := fmt.Sprintf("instance_name=\"%s\",component_type=\"%s\",component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_usage_millicores{%s} %d\n", labels, m.Cpu.Current)) + } + sb.WriteString("\n") + + // Write container CPU limit metrics + sb.WriteString("# HELP 
ibm_codeengine_instance_cpu_limit_millicores Configured CPU limit in millicores\n") + sb.WriteString("# TYPE ibm_codeengine_instance_cpu_limit_millicores gauge\n") + for _, m := range metrics { + if m.Cpu.Configured > 0 { + labels := fmt.Sprintf("instance_name=\"%s\",component_type=\"%s\",component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_cpu_limit_millicores{%s} %d\n", labels, m.Cpu.Configured)) + } + } + sb.WriteString("\n") + + // Write container memory usage metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_usage_bytes Current memory usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_usage_bytes gauge\n") + for _, m := range metrics { + labels := fmt.Sprintf("instance_name=\"%s\",component_type=\"%s\",component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_usage_bytes{%s} %d\n", labels, m.Memory.Current*1000*1000)) + } + sb.WriteString("\n") + + // Write container memory limit metrics + sb.WriteString("# HELP ibm_codeengine_instance_memory_limit_bytes Configured memory limit in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_memory_limit_bytes gauge\n") + for _, m := range metrics { + if m.Memory.Configured > 0 { + labels := fmt.Sprintf("instance_name=\"%s\",component_type=\"%s\",component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_memory_limit_bytes{%s} %d\n", labels, m.Memory.Configured*1000*1000)) + } + } + sb.WriteString("\n") + + // Write container ephemeral storage usage metrics (if available) + hasStorageMetrics := false + for _, m := range metrics { + if m.DiskUsage.Current > 
0 { + hasStorageMetrics = true + break + } + } + if hasStorageMetrics { + sb.WriteString("# HELP ibm_codeengine_instance_ephemeral_storage_usage_bytes Current ephemeral storage usage in bytes\n") + sb.WriteString("# TYPE ibm_codeengine_instance_ephemeral_storage_usage_bytes gauge\n") + for _, m := range metrics { + if m.DiskUsage.Current > 0 { + labels := fmt.Sprintf("instance_name=\"%s\",component_type=\"%s\",component_name=\"%s\"", + escapeLabelValue(m.Name), + escapeLabelValue(m.ComponentType), + escapeLabelValue(m.ComponentName)) + // Convert MB to bytes + sb.WriteString(fmt.Sprintf("ibm_codeengine_instance_ephemeral_storage_usage_bytes{%s} %d\n", labels, m.DiskUsage.Current*1000*1000)) + } + } + sb.WriteString("\n") + } + + if os.Getenv("METRICS_INTERNAL_STATS") == "true" { + // Write collector self-monitoring metrics + sb.WriteString("# HELP codeengine_collector_collection_duration_seconds Time taken to collect metrics in seconds\n") + sb.WriteString("# TYPE codeengine_collector_collection_duration_seconds gauge\n") + durationMs := collectorStats.lastCollectionDuration.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_duration_seconds %.3f\n", float64(durationMs)/1000.0)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_last_collection_timestamp_seconds Unix timestamp of last successful collection\n") + sb.WriteString("# TYPE codeengine_collector_last_collection_timestamp_seconds gauge\n") + lastCollectionTime := collectorStats.lastCollectionTime.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_last_collection_timestamp_seconds %d\n", lastCollectionTime)) + sb.WriteString("\n") + + sb.WriteString("# HELP codeengine_collector_collection_errors_total Total number of collection errors\n") + sb.WriteString("# TYPE codeengine_collector_collection_errors_total counter\n") + totalErrors := collectorStats.totalErrors.Load() + sb.WriteString(fmt.Sprintf("codeengine_collector_collection_errors_total %d\n", totalErrors)) 
+ sb.WriteString("\n") + } + + return sb.String() +} + +func main() { jobMode := os.Getenv("JOB_MODE") // In task mode, collect the resource metrics once if jobMode == "task" { - collectInstanceMetrics() + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + os.Exit(1) + } return } @@ -42,11 +213,105 @@ func main() { } } - // In daemon mode, collect resource metrics in an endless loop - for { - collectInstanceMetrics() - time.Sleep(time.Duration(sleepDuration) * time.Second) + // Check if HTTP metrics server should be enabled + metricsEnabled := os.Getenv("METRICS_ENABLED") == "true" + + // Get metrics port configuration + metricsPort := "9100" + if port := os.Getenv("METRICS_PORT"); port != "" { + metricsPort = port + } + + // Create context for graceful shutdown + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Setup signal handling for graceful shutdown + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // Start HTTP server only if METRICS_ENABLED=true + var server *http.Server + var serverErrors chan error + + if metricsEnabled { + server = &http.Server{ + Addr: ":" + metricsPort, + Handler: setupHTTPHandlers(), + } + + // Start HTTP server in a goroutine + serverErrors = make(chan error, 1) + go func() { + fmt.Printf("Starting HTTP metrics server on port %s\n", metricsPort) + if err := server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + serverErrors <- fmt.Errorf("HTTP server error: %w", err) + } + }() + } else { + fmt.Println("HTTP metrics server disabled (METRICS_ENABLED not set to 'true')") + } + + // Start metrics collection loop in a goroutine + collectionDone := make(chan struct{}) + go func() { + defer close(collectionDone) + ticker := time.NewTicker(time.Duration(sleepDuration) * time.Second) + defer ticker.Stop() + + // Collect metrics immediately on startup + if err := 
collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + + for { + select { + case <-ctx.Done(): + fmt.Println("Stopping metrics collection...") + return + case <-ticker.C: + if err := collectInstanceMetrics(metricsCache); err != nil { + fmt.Printf("Error collecting metrics: %v\n", err) + collectorStats.totalErrors.Add(1) + } + } + } + }() + + // Wait for shutdown signal or server error + if metricsEnabled { + select { + case sig := <-sigChan: + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) + case err := <-serverErrors: + fmt.Printf("Server error: %v\n", err) + } + } else { + // If server is not running, just wait for signal + sig := <-sigChan + fmt.Printf("\nReceived signal %v, initiating graceful shutdown...\n", sig) } + + // Cancel context to stop metrics collection + cancel() + + // Shutdown HTTP server with timeout (only if it was started) + if metricsEnabled && server != nil { + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer shutdownCancel() + + if err := server.Shutdown(shutdownCtx); err != nil { + fmt.Printf("HTTP server shutdown error: %v\n", err) + } else { + fmt.Println("HTTP server stopped gracefully") + } + } + + // Wait for metrics collection to finish + <-collectionDone + fmt.Println("Metrics collection stopped") + fmt.Println("Shutdown complete") } type ComponentType int64 @@ -88,28 +353,125 @@ type InstanceResourceStats struct { Message string `json:"message"` } +// buildPodMap creates a map of pod names to pod objects for O(1) lookup +func buildPodMap(pods *[]v1.Pod) map[string]*v1.Pod { + podMap := make(map[string]*v1.Pod, len(*pods)) + for i := range *pods { + podMap[(*pods)[i].Name] = &(*pods)[i] + } + return podMap +} + +// extractComponentMetadata extracts component type, name, and parent from pod metric labels +func extractComponentMetadata(podMetric *v1beta1.PodMetrics) 
(componentType ComponentType, componentName, parent string) { + componentType = determineComponentType(podMetric) + + switch componentType { + case Job: + if val, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { + componentName = val + } else { + componentName = "standalone" + } + parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] + case App: + componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] + parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] + case Build: + if val, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { + componentName = val + } else { + componentName = "standalone" + } + parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] + default: + componentName = "unknown" + } + + return +} + +// processMetric processes a single pod metric and outputs the JSON log line +func processMetric( + podMetric *v1beta1.PodMetrics, + podMap map[string]*v1.Pod, + clientset *kubernetes.Clientset, + namespace string, + config *rest.Config, +) *InstanceResourceStats { + // Extract component metadata + componentType, componentName, parent := extractComponentMetadata(podMetric) + + // Determine the actual CPU and memory usage + cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 + memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 + + stats := InstanceResourceStats{ + Metric: "instance-resources", + Name: podMetric.Name, + Parent: parent, + ComponentType: componentType.String(), + ComponentName: componentName, + Cpu: ResourceStats{ + Current: int64(cpuCurrent), + }, + Memory: ResourceStats{ + Current: int64(memoryCurrent), + }, + } + + // Gather the configured resource limits and calculate the usage (in percent) + pod := podMap[podMetric.Name] + if pod != nil { + userContainerName := getUserContainerName(componentType, pod) + + // determine the actual disk 
usage + storageCurrent := obtainDiskUsage(clientset, namespace, podMetric.Name, userContainerName, config) + stats.DiskUsage.Current = int64(storageCurrent) + + // extract memory and cpu limits + cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) + + cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 + stats.Cpu.Configured = int64(cpuLimit) + stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) + + memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 + stats.Memory.Configured = int64(memoryLimit) + stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) + } + + // Compose the log line message + stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" + + // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, + // which allows to annotate log lines by providing a JSON object instead of a simple string + fmt.Println(ToJSONString(&stats)) + + return &stats +} + // Helper function that retrieves all pods and all pod metrics // this function creates a structured log line for each pod for which the kube metrics api provides a metric -func collectInstanceMetrics() { - +func collectInstanceMetrics(cache *MetricsCache) error { startTime := time.Now() fmt.Println("Start to capture pod metrics ...") config, err := rest.InClusterConfig() if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to get cluster config: %w", err) } // obtain the kube namespace related to this Code Engine project nsBytes, err := os.ReadFile("/var/run/secrets/kubernetes.io/serviceaccount/namespace") if err != nil { - panic(err.Error()) + return fmt.Errorf("failed to read namespace: %w", err) } namespace := string(nsBytes) coreClientset, err := kubernetes.NewForConfig(config) if err != nil { - 
panic(err.Error()) + return fmt.Errorf("failed to create clientset: %w", err) } // fetches all pods @@ -118,96 +480,53 @@ func collectInstanceMetrics() { // fetch all pod metrics podMetrics := getAllPodMetrics(namespace, config) + // Build pod map for O(1) lookup + podMap := buildPodMap(pods) + + // Collect metrics into a slice + var collectedMetrics []InstanceResourceStats + var metricsMu sync.Mutex + + // Use semaphore to limit concurrent goroutines + const maxConcurrency = 20 + sem := make(chan struct{}, maxConcurrency) var wg sync.WaitGroup for _, metric := range *podMetrics { wg.Add(1) + sem <- struct{}{} // Acquire semaphore go func(podMetric *v1beta1.PodMetrics) { defer wg.Done() + defer func() { <-sem }() // Release semaphore - // Determine the component type (either app, job, build or unknown) - componentType := determineComponentType(podMetric) - - // Determine the component name - var componentName string - var parent string - switch componentType { - case Job: - if val, ok := podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-definition-name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - parent = podMetric.ObjectMeta.Labels["codeengine.cloud.ibm.com/job-run"] - case App: - componentName = podMetric.ObjectMeta.Labels["serving.knative.dev/service"] - parent = podMetric.ObjectMeta.Labels["serving.knative.dev/revision"] - case Build: - if val, ok := podMetric.ObjectMeta.Labels["build.shipwright.io/name"]; ok { - componentName = val - } else { - componentName = "standalone" - } - - parent = podMetric.ObjectMeta.Labels["buildrun.shipwright.io/name"] - default: - componentName = "unknown" - } - - // Determine the actual CPU and memory usage - cpuCurrent := podMetric.Containers[0].Usage.Cpu().ToDec().AsApproximateFloat64() * 1000 - memoryCurrent := podMetric.Containers[0].Usage.Memory().ToDec().AsApproximateFloat64() / 1000 / 1000 - - stats := InstanceResourceStats{ - Metric: "instance-resources", - Name: podMetric.Name, - 
Parent: parent, - ComponentType: componentType.String(), - ComponentName: componentName, - Cpu: ResourceStats{ - Current: int64(cpuCurrent), - }, - Memory: ResourceStats{ - Current: int64(memoryCurrent), - }, + stats := processMetric(podMetric, podMap, coreClientset, namespace, config) + if stats != nil { + metricsMu.Lock() + collectedMetrics = append(collectedMetrics, *stats) + metricsMu.Unlock() } - - // Gather the configured resource limits and calculate the usage (in percent) - pod := getPod(podMetric.Name, pods) - if pod != nil { - - userContainerName := getUserContainerName(componentType, pod) - - // determine the actual disk usage - storageCurrent := obtainDiskUsage(coreClientset, namespace, podMetric.Name, userContainerName, config) - stats.DiskUsage.Current = int64(storageCurrent) - - // extract memory and cpu limits - cpu, memory := getCpuAndMemoryLimits(userContainerName, pod) - - cpuLimit := cpu.ToDec().AsApproximateFloat64() * 1000 - stats.Cpu.Configured = int64(cpuLimit) - stats.Cpu.Usage = int64((cpuCurrent / cpuLimit) * 100) - - memoryLimit := memory.ToDec().AsApproximateFloat64() / 1000 / 1000 - stats.Memory.Configured = int64(memoryLimit) - stats.Memory.Usage = int64(memoryCurrent / memoryLimit * 100) - } - - // Compose the log line message - stats.Message = "Captured metrics of " + stats.ComponentType + " instance '" + stats.Name + "': " + fmt.Sprintf("%d", stats.Cpu.Current) + "m vCPU, " + fmt.Sprintf("%d", stats.Memory.Current) + " MB memory, " + fmt.Sprintf("%d", stats.DiskUsage.Current) + " MB disk usage" - - // Write the stringified JSON struct and make use of IBM Cloud Logs built-in parsing mechanism, - // which allows to annotate log lines by providing a JSON object instead of a simple string - fmt.Println(ToJSONString(&stats)) - }(&metric) } wg.Wait() - fmt.Println("Captured pod metrics in " + strconv.FormatInt(time.Since(startTime).Milliseconds(), 10) + " ms") + duration := time.Since(startTime) + fmt.Println("Captured pod metrics in " + 
strconv.FormatInt(duration.Milliseconds(), 10) + " ms") + + // Update cache with collected metrics + cache.mu.Lock() + cache.metrics = collectedMetrics + cache.namespace = namespace + cache.lastUpdate = time.Now() + cache.collectionCount++ + cache.mu.Unlock() + + // Update collector statistics + collectorStats.lastCollectionDuration.Store(duration.Milliseconds()) + collectorStats.lastCollectionTime.Store(time.Now().Unix()) + + return nil } // Helper function to determine the component type diff --git a/metrics-collector/prometheus.yml.template b/metrics-collector/prometheus.yml.template new file mode 100644 index 000000000..991021971 --- /dev/null +++ b/metrics-collector/prometheus.yml.template @@ -0,0 +1,97 @@ +global: + scrape_interval: 30s + external_labels: + code_engine_project_name: 'my-project-name' + +scrape_configs: + - job_name: 'codeengine-metrics-collector' + static_configs: + - targets: ['localhost:9100'] + relabel_configs: + # Add project name label + - source_labels: [job] + action: replace + regex: (.+) + replacement: 'my-project-name' + target_label: code_engine_project_name + + - job_name: 'codeengine-metrics-usermetrics' + fallback_scrape_protocol: PrometheusText0.0.4 + kubernetes_sd_configs: + - api_server: 'https://172.21.0.1' + role: pod + namespaces: + names: + - ${CE_SUBDOMAIN} + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + relabel_configs: + # only scrape when annotation codeengine.cloud.ibm.com/userMetricsScrape: 'true' is set + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsScrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPath] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_codeengine_cloud_ibm_com_userMetricsPort] + action: replace + 
regex: (.+):(?:\d+);(\d+) + replacement: ${1}:${2} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + # rename important meta data labels + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: code_engine_project_namespace + - source_labels: [job] + action: replace + regex: (.+) + replacement: 'my-project-name' + target_label: code_engine_project_name + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: code_engine_instance_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_service] + action: replace + target_label: code_engine_component_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_configuration] + action: replace + regex: (.+) + replacement: app + target_label: code_engine_component_type + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revision] + action: replace + target_label: code_engine_subcomponent_name + - source_labels: [__meta_kubernetes_pod_label_serving_knative_dev_revisionUID] + action: replace + regex: (.+) + replacement: app_revision + target_label: code_engine_subcomponent_type + # drop codeengine, istio, and knative labels + - action: labeldrop + regex: "codeengine_cloud_ibm_com_(.+)" + - action: labeldrop + regex: "security_istio_io_(.+)" + - action: labeldrop + regex: "service_istio_io_(.+)" + - action: labeldrop + regex: "serving_knative_dev_(.+)" + # drop default prometheus labels + - action: labeldrop + regex: "instance" + - action: labeldrop + regex: "job" + - action: labeldrop + regex: "pod_template_hash" + - action: labeldrop + regex: "app" +# +# Define IBM Cloud Monitoring as the remote write target +# +remote_write: +- url: https://${METRICS_REMOTE_WRITE_FQDN}/prometheus/remote/write + authorization: + credentials_file: "/etc/secrets/monitoring-apikey" \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/README.md 
b/metrics-collector/setup/ibm-cloud-monitoring/README.md new file mode 100644 index 000000000..829e883cf --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/README.md @@ -0,0 +1,242 @@ +# IBM Cloud Monitoring Dashboard Setup + +This directory contains tools and dashboards for IBM Cloud Monitoring (Sysdig) integration. + +## Files + +- **`import_dashboard.py`**: Python script to create or update Sysdig dashboards +- **`code-engine-component-resource-overview.json`**: Dashboard configuration for Code Engine resource monitoring + +## Prerequisites + +1. **Python 3.6+** installed on your system + +2. **IBM Cloud Account** with: + - An IBM Cloud Monitoring (Sysdig) instance + - An IBM Cloud IAM API key with access to the Monitoring instance + - The Monitoring instance ID (GUID) + +3. **Metrics Data**: The dashboard expects metrics from the Code Engine metrics collector to be available in your Sysdig instance + +### Getting Your IBM Cloud Credentials + +**IBM Cloud IAM API Key:** +1. Log in to [IBM Cloud Console](https://cloud.ibm.com) +2. Go to **Manage** > **Access (IAM)** > **API keys** +3. Click **Create an IBM Cloud API key** +4. Give it a name and description +5. Copy and save the API key securely + +**Monitoring Instance ID:** +1. Navigate to your IBM Cloud Monitoring instance +2. Click on **Overview** or **Settings** +3. 
Copy the **Instance ID** (GUID format: `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`) + +**Region:** +- Note the region where your Monitoring instance is deployed (e.g., `us-south`, `eu-de`) + +## Setup + +### Using a Virtual Environment (Recommended) + +It's recommended to use a Python virtual environment to isolate dependencies: + +```bash +# Navigate to the setup directory +cd setup/ibm-cloud-monitoring + +# Create a virtual environment +python3 -m venv venv + +# Activate the virtual environment +# On macOS/Linux: +source venv/bin/activate +# On Windows: +# venv\Scripts\activate + +# Install required dependencies +pip install requests + +# You should now see (venv) in your terminal prompt +``` + +When you're done, deactivate the virtual environment: +```bash +deactivate +``` + +### Global Installation (Alternative) + +If you prefer to install dependencies globally: + +```bash +pip install requests +# or +pip3 install requests +``` + +## Usage + +### Import or Update Dashboard + +```bash +python import_dashboard.py \ + --iam-api-key YOUR_IBM_CLOUD_IAM_API_KEY \ + --instance-id YOUR_MONITORING_INSTANCE_ID \ + --region us-south \ + --dashboard code-engine-component-resource-overview.json +``` + +### Using Environment Variables + +```bash +export IBM_CLOUD_IAM_API_KEY=YOUR_IBM_CLOUD_IAM_API_KEY +export SYSDIG_INSTANCE_ID=YOUR_MONITORING_INSTANCE_ID +export SYSDIG_REGION=us-south +python import_dashboard.py --dashboard code-engine-component-resource-overview.json +``` + +### Supported Regions + +- `us-south` - US South (Dallas) +- `us-east` - US East (Washington DC) +- `eu-de` - EU Central (Frankfurt) +- `eu-gb` - EU GB (London) +- `jp-tok` - Japan (Tokyo) +- `au-syd` - Australia (Sydney) +- `jp-osa` - Japan (Osaka) +- `ca-tor` - Canada (Toronto) +- `br-sao` - Brazil (São Paulo) + +## Dashboard: Code Engine Container Resource Overview + +The `code-engine-component-resource-overview.json` dashboard provides comprehensive monitoring of Code Engine resources: + +### Panels + 
+1. **CPU Usage vs Limit (per Pod)** - Compares live CPU usage to configured limits +2. **CPU Utilization % (per App)** - CPU percentage by component +3. **Memory Usage vs Limit (per Pod)** - Compares memory usage to limits +4. **Memory Utilization % (per App)** - Memory percentage by component +5. **CPU Utilization % (per Namespace)** - Namespace-level CPU monitoring +6. **Memory Utilization % (per Namespace)** - Namespace-level memory monitoring +7. **CPU Utilization % (per Revision/Parent)** - Revision-level CPU tracking +8. **Memory Utilization % (per Revision/Parent)** - Revision-level memory tracking +9. **Top Pods by CPU** - Top 10 CPU consumers +10. **Top Pods by Memory** - Top 10 memory consumers +11. **Cluster CPU Utilization (%)** - Global CPU percentage +12. **Cluster Memory Utilization (%)** - Global memory percentage + +### Required Metrics + +The dashboard uses the following Prometheus metrics: + +- `ibm_codeengine_instance_cpu_usage_millicores` +- `ibm_codeengine_instance_cpu_limit_millicores` +- `ibm_codeengine_instance_memory_usage_bytes` +- `ibm_codeengine_instance_memory_limit_bytes` + +These metrics are exposed by the Code Engine metrics collector when running with `METRICS_ENABLED=true`. 
+ +## Script Features + +The `import_dashboard.py` script: + +- ✅ Creates new dashboards if they don't exist +- ✅ Updates existing dashboards with the same name +- ✅ Validates API credentials and region +- ✅ Provides clear error messages +- ✅ Displays dashboard URL after creation/update + +## Troubleshooting + +### Authentication Errors + +If you get authentication errors, verify: +- Your IBM Cloud IAM API key is correct and not expired +- The IAM API key has permissions to access the Monitoring instance +- The Monitoring instance ID is correct +- You're using the correct region where the instance is deployed + +### Dashboard Not Showing Data + +If the dashboard shows no data: +- Verify the metrics collector is running with `METRICS_ENABLED=true` +- Check that metrics are being sent to IBM Cloud Monitoring +- Ensure the Prometheus remote write configuration is correct +- Wait a few minutes for data to appear (initial scrape interval) + +### Import Errors + +If the import fails: +- Check that the JSON file is valid +- Ensure you have network connectivity to IBM Cloud +- Verify the region endpoint is accessible + +## Example: Complete Setup with Virtual Environment + +```bash +# 1. Navigate to the setup directory +cd setup/ibm-cloud-monitoring + +# 2. Create and activate virtual environment +python3 -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate + +# 3. Install dependencies +pip install requests + +# 4. Set environment variables +export IBM_CLOUD_IAM_API_KEY=your-iam-api-key-here +export SYSDIG_INSTANCE_ID=your-instance-id-here +export SYSDIG_REGION=us-south + +# 5. Import the dashboard +python import_dashboard.py --dashboard code-engine-component-resource-overview.json + +# Output: +# Loading dashboard configuration from 'code-engine-component-resource-overview.json'... +# Obtaining IBM Cloud IAM access token... +# ✓ IAM access token obtained successfully +# Checking if dashboard 'IBM Cloud Code Engine - Component Resource Overview' exists... 
+# Dashboard 'IBM Cloud Code Engine - Component Resource Overview' not found. Creating new dashboard... +# ✓ Dashboard 'IBM Cloud Code Engine - Component Resource Overview' created successfully (ID: 12345)! +# +# Dashboard URL: https://us-south.monitoring.cloud.ibm.com/#/dashboards/12345 +# +# ✓ Operation completed successfully! + +# 6. Deactivate virtual environment when done +deactivate +``` + +## Example: Quick Run (Without Virtual Environment) + +```bash +# 1. Install dependencies globally +pip3 install requests + +# 2. Run the script +cd setup/ibm-cloud-monitoring +python3 import_dashboard.py \ + --iam-api-key your-iam-api-key-here \ + --instance-id your-instance-id-here \ + --region us-south \ + --dashboard code-engine-component-resource-overview.json +``` + +## Customizing Dashboards + +To customize the dashboard: + +1. Edit `code-engine-component-resource-overview.json` +2. Modify panel queries, layouts, or add new panels +3. Run the import script to update the dashboard + +The script will detect the existing dashboard by name and update it with your changes. 
+ +## Additional Resources + +- [IBM Cloud Monitoring Documentation](https://cloud.ibm.com/docs/monitoring) +- [Sysdig Dashboard API](https://docs.sysdig.com/en/docs/developer-tools/sysdig-rest-api-conventions/) +- [PromQL Query Language](https://prometheus.io/docs/prometheus/latest/querying/basics/) \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json new file mode 100644 index 000000000..3e40ae81b --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/code-engine-component-resource-overview.json @@ -0,0 +1,1243 @@ +{ + "name": "IBM Cloud Code Engine - Component Resource Overview", + "panels": [ + { + "id": 1, + "type": "text", + "name": "Dashboard Overview", + "description": "", + "nullValueDisplayText": null, + "links": null, + "markdownSource": "Monitor CPU, Memory, and Disk storage usage across Code Engine components.\n\n**Use the scope filters above to narrow by:**\n- `component_type` (app, job, build)\n- `component_name` (specific app/job/build name)", + "transparentBackground": false, + "panelTitleVisible": true, + "textAutosized": false + }, + { + "id": 5, + "type": "advancedTimechart", + "name": "CPU Utilization % (by Component)", + "description": "CPU usage as percentage of limit, grouped by component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (component_name) (ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum by (component_name) (ibm_codeengine_instance_cpu_limit_millicores)) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": "{{component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + 
}, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "%", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 0.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 6, + "type": "advancedTimechart", + "name": "Memory Utilization % (by Component)", + "description": "Memory usage as percentage of limit, grouped by component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum by (component_name) (ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum by (component_name) (ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": "{{component_name}}", + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + 
"legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "high", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 8, + "type": "advancedNumber", + "name": "Total CPU Utilization %", + "description": "Overall CPU utilization across all containers", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 90.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + 
"value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 9, + "type": "advancedNumber", + "name": "Total Memory Utilization %", + "description": "Overall memory utilization across all containers", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "(sum(ibm_codeengine_instance_memory_usage_bytes{$__scope}) / sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})) * 100", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "%", + "inputFormat": "0-100", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "medium", + "value": 80.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "ok", + "value": 30.0, + "inputFormat": "0-100", + "displayText": "" + }, + { + "severity": "none", + "value": 1.0, + "inputFormat": "0-100", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 7, + "type": "advancedTimechart", + "name": "Disk Storage Usage\u00a0(by Component)", + "description": "Storage usage grouped by component_name", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum by (component_name) (ibm_codeengine_instance_ephemeral_storage_usage_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Disk storage in use ", + "timeSeriesDisplayNameTemplate": "{{component_name}}", + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + 
"decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "B", + "maxInputFormat": "B", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 4, + "type": "advancedTimechart", + "name": "Disk Storage Usage (per Instance)", + "description": "Current ephemeral storage usage", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_ephemeral_storage_usage_bytes{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Disk Storage Usage", + "timeSeriesDisplayNameTemplate": "{{component_name}}/{{instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + 
"axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "B", + "maxInputFormat": "B", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 11, + "type": "advancedNumber", + "name": "Total Available CPUs", + "description": "Overall CPU usage across all components", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_limit_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 12, + "type": "advancedNumber", + "name": "Total Available Memory", + "description": "Overall memory utilization across all containers", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_limit_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" 
+ }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 13, + "type": "advancedNumber", + "name": "Total Used CPUs", + "description": "Overall CPU usage across all components", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_cpu_usage_millicores{$__scope}) / 1000", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 14, + "type": "advancedNumber", + "name": "Total Used Memory", + "description": "Overall memory utilization across all containers", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "sum(ibm_codeengine_instance_memory_usage_bytes{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Memory %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 1, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + 
], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 2, + "type": "advancedTimechart", + "name": "CPU Usage (per Instance)", + "description": "Current CPU usage compared to configured limits", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "ibm_codeengine_instance_cpu_usage_millicores{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "CPU Usage", + "timeSeriesDisplayNameTemplate": "{{component_name}}/{{instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "0-100", + "maxInputFormat": "0-100", + "scale": "linear" + }, + "right": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 3, + "type": "advancedTimechart", + "name": "Memory Usage (per Instance)", + "description": "Current memory usage compared to configured limits", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": 
"ibm_codeengine_instance_memory_usage_bytes{$__scope}", + "enabled": true, + "displayInfo": { + "displayName": "Memory Usage", + "timeSeriesDisplayNameTemplate": "{{component_name}}/{{instance_name}} usage", + "type": "lines" + }, + "format": { + "unit": "byte", + "inputFormat": "B", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "connectSolid", + "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "day" + } + } + ], + "legendConfiguration": { + "enabled": true, + "position": "right", + "layout": "table", + "showCurrent": true, + "showMax": true, + "showMin": true, + "width": null, + "height": null + }, + "axesConfiguration": { + "bottom": { + "enabled": true + }, + "left": { + "enabled": true, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "B", + "maxInputFormat": "B", + "scale": "linear" + }, + "right": { + "enabled": false, + "displayName": null, + "unit": "auto", + "displayFormat": "auto", + "decimals": null, + "minValue": 0.0, + "maxValue": null, + "minInputFormat": "1", + "maxInputFormat": "1", + "scale": "linear" + } + }, + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 15, + "type": "advancedNumber", + "name": "Components with Running Instances", + "description": "Number of Code Engine components that do have running instances", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "count(count(\n ibm_codeengine_instance_cpu_limit_millicores{$__scope}\n)by (component_name))", + "enabled": true, + "displayInfo": { + "displayName": "CPU %", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", 
+ "minInterval": null + }, + "compareTo": { + "enabled": false, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [], + "base": { + "severity": "none", + "displayText": "" + } + } + }, + { + "id": 16, + "type": "advancedNumber", + "name": "Running Instances", + "description": "Number of Code Engine instances that are currently running", + "nullValueDisplayText": null, + "links": null, + "advancedQueries": [ + { + "query": "count by (code_engine_instance_name) (ibm_codeengine_instance_cpu_limit_millicores{$__scope})", + "enabled": true, + "displayInfo": { + "displayName": "Instances", + "timeSeriesDisplayNameTemplate": null, + "type": "lines" + }, + "format": { + "unit": "number", + "inputFormat": "1", + "displayFormat": "auto", + "decimals": 0, + "yAxis": "auto", + "nullValueDisplayMode": "nullGap", + "minInterval": null + }, + "compareTo": { + "enabled": true, + "delta": 1, + "timeFormat": "hour" + } + } + ], + "numberThresholds": { + "useDefaults": null, + "values": [ + { + "severity": "info", + "value": 80.0, + "inputFormat": "1", + "displayText": "" + } + ], + "base": { + "severity": "none", + "displayText": "" + } + } + } + ], + "scopeExpressionList": [ + { + "operand": "code_engine_project_name", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "code_engine_project_name", + "id": "code_engine_project_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "code_engine_project_name", + "description": "code_engine_project_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", +
"host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "code_engine_project_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1770938413029 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "component_type", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "component_type", + "id": "component_type", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "component_type", + "description": "component_type", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "component_type", + "heuristic": false, + "documentType": "metric", + "segment": false, + 
"documentTimestamp": 1770938413029 + }, + "variable": true, + "isVariable": true + }, + { + "operand": "component_name", + "operator": "in", + "displayName": "", + "value": [], + "descriptor": { + "documentId": "component_name", + "id": "component_name", + "metricType": "tag", + "type": "string", + "scale": 0.0, + "name": "component_name", + "description": "component_name", + "namespaces": [ + "cloudProvider", + "host.container", + "ecs", + "host.fs", + "host.file", + "host", + "kubernetes", + "kubernetes.cluster", + "kubernetes.daemonSet", + "kubernetes.deployment", + "kubernetes.job", + "kubernetes.namespace", + "kubernetes.node", + "kubernetes.pod", + "kubernetes.replicaSet", + "kubernetes.service", + "kubernetes.statefulSet", + "kubernetes.resourcequota", + "kubernetes.hpa", + "link", + "mesos", + "host.net", + "host.process", + "prometheus", + "swarm", + "prombeacon" + ], + "scopes": [], + "timeAggregations": [ + "concat", + "distinct", + "count" + ], + "groupAggregations": [ + "concat", + "distinct", + "count" + ], + "aggregationForGroup": "none", + "hidden": false, + "experimental": false, + "deferred": false, + "identity": false, + "canMonitor": false, + "canGroupBy": false, + "canFilter": true, + "generatedFrom": "com.draios.model.metrics.custom.CustomMetric$Tag", + "publicId": "component_name", + "heuristic": false, + "documentType": "metric", + "segment": false, + "documentTimestamp": 1770938413029 + }, + "variable": true, + "isVariable": true + } + ], + "eventDisplaySettings": { + "enabled": true, + "queryParams": { + "severities": [], + "alertStatuses": [], + "categories": [], + "filter": "", + "teamScope": false + } + }, + "shared": true, + "public": false, + "description": "Overview of Code Engine instance resource usage: CPU/memory current vs limits, with filtering by component_type and component_name", + "layout": [ + { + "panelId": 1, + "x": 0, + "y": 0, + "w": 7, + "h": 4 + }, + { + "panelId": 5, + "x": 0, + "y": 4, + "w": 12, + "h": 6 + }, + { + 
"panelId": 6, + "x": 0, + "y": 10, + "w": 12, + "h": 6 + }, + { + "panelId": 8, + "x": 21, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 9, + "x": 21, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 7, + "x": 0, + "y": 16, + "w": 12, + "h": 6 + }, + { + "panelId": 4, + "x": 12, + "y": 16, + "w": 12, + "h": 6 + }, + { + "panelId": 11, + "x": 14, + "y": 0, + "w": 4, + "h": 2 + }, + { + "panelId": 12, + "x": 14, + "y": 2, + "w": 4, + "h": 2 + }, + { + "panelId": 13, + "x": 18, + "y": 0, + "w": 3, + "h": 2 + }, + { + "panelId": 14, + "x": 18, + "y": 2, + "w": 3, + "h": 2 + }, + { + "panelId": 2, + "x": 12, + "y": 4, + "w": 12, + "h": 6 + }, + { + "panelId": 3, + "x": 12, + "y": 10, + "w": 12, + "h": 6 + }, + { + "panelId": 15, + "x": 7, + "y": 0, + "w": 4, + "h": 4 + }, + { + "panelId": 16, + "x": 11, + "y": 0, + "w": 3, + "h": 4 + } + ], + "schema": 3 +} \ No newline at end of file diff --git a/metrics-collector/setup/ibm-cloud-monitoring/export_dashboard.py b/metrics-collector/setup/ibm-cloud-monitoring/export_dashboard.py new file mode 100644 index 000000000..983965771 --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/export_dashboard.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python3 +""" +IBM Cloud Monitoring Dashboard Export Script + +This script exports Sysdig dashboards from IBM Cloud Monitoring using IBM Cloud IAM authentication. +It uses an IBM Cloud IAM API key to obtain an access token, then retrieves the dashboard via the Sysdig API. 
+ +Usage: + python export_dashboard.py --iam-api-key --instance-id --region --name + +Environment Variables: + IBM_CLOUD_IAM_API_KEY: IBM Cloud IAM API key (alternative to --iam-api-key) + SYSDIG_INSTANCE_ID: IBM Cloud Monitoring instance ID (alternative to --instance-id) + SYSDIG_REGION: IBM Cloud Monitoring region (alternative to --region) + +Example: + python export_dashboard.py \\ + --iam-api-key YOUR_IAM_API_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --name "IBM Cloud Code Engine - Component Resource Overview" +""" + +import argparse +import json +import os +import sys +from datetime import datetime +from typing import Dict, Optional + +try: + import requests +except ImportError: + print("Error: 'requests' module not found. Install it with: pip install requests") + sys.exit(1) + + +class IBMCloudIAMAuth: + """Handles IBM Cloud IAM authentication.""" + + IAM_TOKEN_URL = "https://iam.cloud.ibm.com/identity/token" + + def __init__(self, iam_api_key: str): + """ + Initialize IBM Cloud IAM authentication. + + Args: + iam_api_key: IBM Cloud IAM API key + """ + self.iam_api_key = iam_api_key + self._access_token = None + self._token_expiry = 0 + + def get_access_token(self) -> str: + """ + Get an IBM Cloud IAM access token. 
+ + Returns: + IAM access token + """ + print("Obtaining IBM Cloud IAM access token...") + + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Accept": "application/json" + } + + data = { + "grant_type": "urn:ibm:params:oauth:grant-type:apikey", + "apikey": self.iam_api_key + } + + try: + response = requests.post( + self.IAM_TOKEN_URL, + headers=headers, + data=data, + timeout=30 + ) + response.raise_for_status() + token_data = response.json() + self._access_token = token_data.get("access_token") + + if not self._access_token: + raise ValueError("No access token in IAM response") + + print("✓ IAM access token obtained successfully") + return self._access_token + + except requests.exceptions.RequestException as e: + print(f"Error obtaining IAM token: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + +class SysdigDashboardManager: + """Manages Sysdig dashboard operations via REST API with IBM Cloud IAM authentication.""" + + # IBM Cloud Monitoring regional endpoints + REGION_ENDPOINTS = { + "us-south": "https://us-south.monitoring.cloud.ibm.com", + "us-east": "https://us-east.monitoring.cloud.ibm.com", + "eu-de": "https://eu-de.monitoring.cloud.ibm.com", + "eu-es": "https://eu-es.monitoring.cloud.ibm.com", + "eu-gb": "https://eu-gb.monitoring.cloud.ibm.com", + "jp-tok": "https://jp-tok.monitoring.cloud.ibm.com", + "au-syd": "https://au-syd.monitoring.cloud.ibm.com", + "jp-osa": "https://jp-osa.monitoring.cloud.ibm.com", + "ca-tor": "https://ca-tor.monitoring.cloud.ibm.com", + "br-sao": "https://br-sao.monitoring.cloud.ibm.com", + } + + def __init__(self, iam_auth: IBMCloudIAMAuth, instance_id: str, region: str): + """ + Initialize the Sysdig Dashboard Manager. 
+ + Args: + iam_auth: IBM Cloud IAM authentication handler + instance_id: IBM Cloud Monitoring instance ID (GUID) + region: IBM Cloud region (e.g., 'us-south', 'eu-de') + """ + if region not in self.REGION_ENDPOINTS: + raise ValueError( + f"Invalid region '{region}'. Valid regions: {', '.join(self.REGION_ENDPOINTS.keys())}" + ) + + self.iam_auth = iam_auth + self.instance_id = instance_id + self.region = region + self.base_url = self.REGION_ENDPOINTS[region] + + def _get_headers(self) -> Dict[str, str]: + """ + Get HTTP headers with IAM authentication. + + Returns: + Dictionary of HTTP headers + """ + access_token = self.iam_auth.get_access_token() + return { + "Authorization": f"Bearer {access_token}", + "IBMInstanceID": self.instance_id, + "Content-Type": "application/json", + } + + def list_dashboards(self) -> list: + """ + List all dashboards in the Sysdig instance. + + Returns: + List of dashboard objects + """ + url = f"{self.base_url}/api/v3/dashboards" + + try: + response = requests.get(url, headers=self._get_headers(), timeout=30) + response.raise_for_status() + data = response.json() + return data.get("dashboards", []) + except requests.exceptions.RequestException as e: + print(f"Error listing dashboards: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + return [] + + def find_dashboard_by_name(self, name: str) -> Optional[Dict]: + """ + Find a dashboard by its exact name, printing each dashboard inspected. + + Args: + name: Dashboard name to search for + + Returns: + Dashboard object if found, None otherwise + """ + dashboards = self.list_dashboards() + for dashboard in dashboards: + print(f"dashboard: {dashboard.get('name')}, id: '{dashboard.get('id')}'")  # single quotes inside the expression keep this valid on Python < 3.12 + if dashboard.get("name") == name: + return dashboard + return None + + def get_dashboard(self, dashboard_id: int) -> Dict: + """ + Get a dashboard by its ID. 
+ + Args: + dashboard_id: ID of the dashboard to retrieve + + Returns: + Dashboard object + """ + url = f"{self.base_url}/api/v3/dashboards/{dashboard_id}" + + try: + response = requests.get(url, headers=self._get_headers(), timeout=30) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"Error retrieving dashboard: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + def export_dashboard_by_name(self, name: str, output_dir: str = ".") -> str: + """ + Export a dashboard by its name to a JSON file. + + Args: + name: Dashboard name to export + output_dir: Directory to save the exported file (default: current directory) + + Returns: + Path to the exported file + """ + print(f"Searching for dashboard '{name}'...") + dashboard_summary = self.find_dashboard_by_name(name) + + if not dashboard_summary: + raise ValueError(f"Dashboard '{name}' not found") + + dashboard_id = dashboard_summary.get("id") + if dashboard_id is None: + raise ValueError(f"Dashboard '{name}' found but has no ID") + + print(f"✓ Dashboard found (ID: {dashboard_id})") + print(f"Retrieving full dashboard configuration...") + + # Get the full dashboard configuration + dashboard_data = self.get_dashboard(dashboard_id) + + # Extract just the dashboard object (without wrapper) + dashboard_config = dashboard_data.get("dashboard", {}) + + # Generate filename with dashboard name and timestamp + timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S") + # Sanitize dashboard name for filename + safe_name = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in name) + safe_name = safe_name.replace(' ', '_').lower() + filename = f"{safe_name}_{timestamp}.json" + filepath = os.path.join(output_dir, filename) + + # Save to file + print(f"Saving dashboard to '{filepath}'...") + with open(filepath, 'w') as f: + json.dump(dashboard_config, f, indent=2) + + print(f"✓ Dashboard exported 
successfully!") + return filepath + + +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Export IBM Cloud Monitoring (Sysdig) dashboards using IBM Cloud IAM authentication", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using command-line arguments + python export_dashboard.py \\ + --iam-api-key YOUR_IAM_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --name "IBM Code Engine - Container Resource Overview" + + # Using environment variables + export IBM_CLOUD_IAM_API_KEY=YOUR_IAM_KEY + export SYSDIG_INSTANCE_ID=YOUR_INSTANCE_ID + export SYSDIG_REGION=us-south + python export_dashboard.py --name "My Dashboard" + + # Export to specific directory + python export_dashboard.py \\ + --name "My Dashboard" \\ + --output-dir ./exports + +Supported Regions: + us-south, us-east, eu-de, eu-es, eu-gb, jp-tok, au-syd, jp-osa, ca-tor, br-sao + +How to get your Instance ID: + 1. Go to IBM Cloud Console + 2. Navigate to your Monitoring instance + 3. Click on "Overview" or "Settings" + 4. 
Copy the Instance ID (GUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) + """ + ) + + parser.add_argument( + "--iam-api-key", + help="IBM Cloud IAM API key (or set IBM_CLOUD_IAM_API_KEY env var)", + default=os.environ.get("IBM_CLOUD_IAM_API_KEY") + ) + + parser.add_argument( + "--instance-id", + help="IBM Cloud Monitoring instance ID/GUID (or set SYSDIG_INSTANCE_ID env var)", + default=os.environ.get("SYSDIG_INSTANCE_ID") + ) + + parser.add_argument( + "--region", + help="IBM Cloud region (or set SYSDIG_REGION env var)", + default=os.environ.get("SYSDIG_REGION") + ) + + parser.add_argument( + "--name", + required="--list" not in sys.argv[1:],  # --list only enumerates dashboards, so it must not demand a name + help="Name of the dashboard to export (not required with --list)" + ) + + parser.add_argument( + "--output-dir", + default=".", + help="Directory to save the exported dashboard (default: current directory)" + ) + + parser.add_argument( + "--list", + action="store_true", + help="List all available dashboards and exit" + ) + + args = parser.parse_args() + + # Validate required arguments + if not args.iam_api_key: + print("Error: IAM API key is required. Provide via --iam-api-key or IBM_CLOUD_IAM_API_KEY environment variable") + sys.exit(1) + + if not args.instance_id: + print("Error: Instance ID is required. Provide via --instance-id or SYSDIG_INSTANCE_ID environment variable") + sys.exit(1) + + if not args.region: + print("Error: Region is required. 
Provide via --region or SYSDIG_REGION environment variable") + sys.exit(1) + + # Initialize IAM authentication and dashboard manager + try: + iam_auth = IBMCloudIAMAuth(args.iam_api_key) + manager = SysdigDashboardManager(iam_auth, args.instance_id, args.region) + + # List dashboards if requested + if args.list: + print("Listing all dashboards...") + dashboards = manager.list_dashboards() + if not dashboards: + print("No dashboards found") + else: + print(f"\nFound {len(dashboards)} dashboard(s):\n") + for i, dashboard in enumerate(dashboards, 1): + name = dashboard.get("name", "Unnamed") + dashboard_id = dashboard.get("id", "N/A") + print(f"{i}. {name} (ID: {dashboard_id})") + sys.exit(0) + + # Create output directory if it doesn't exist + if args.output_dir != "." and not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + print(f"Created output directory: {args.output_dir}") + + # Export the dashboard + filepath = manager.export_dashboard_by_name(args.name, args.output_dir) + + print(f"\n✓ Export completed successfully!") + print(f" File: {filepath}") + + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/metrics-collector/setup/ibm-cloud-monitoring/import_dashboard.py b/metrics-collector/setup/ibm-cloud-monitoring/import_dashboard.py new file mode 100644 index 000000000..2dcf4263f --- /dev/null +++ b/metrics-collector/setup/ibm-cloud-monitoring/import_dashboard.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python3 +""" +IBM Cloud Monitoring Dashboard Import/Update Script + +This script creates or updates Sysdig dashboards in IBM Cloud Monitoring using IBM Cloud IAM authentication. +It uses an IBM Cloud IAM API key to obtain an access token, then interacts with the Sysdig API. 
+ +Usage: + python import_dashboard.py --iam-api-key --instance-id --region --dashboard + +Environment Variables: + IBM_CLOUD_IAM_API_KEY: IBM Cloud IAM API key (alternative to --iam-api-key) + SYSDIG_INSTANCE_ID: IBM Cloud Monitoring instance ID (alternative to --instance-id) + SYSDIG_REGION: IBM Cloud Monitoring region (alternative to --region) + +Example: + python import_dashboard.py \\ + --iam-api-key YOUR_IAM_API_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --dashboard code-engine-component-resource-overview.json +""" + +import argparse +import json +import os +import sys +from typing import Dict, Optional + +try: + import requests +except ImportError: + print("Error: 'requests' module not found. Install it with: pip install requests") + sys.exit(1) + + +class IBMCloudIAMAuth: + """Handles IBM Cloud IAM authentication.""" + + IAM_TOKEN_URL = "https://iam.cloud.ibm.com/identity/token" + + def __init__(self, iam_api_key: str): + """ + Initialize IBM Cloud IAM authentication. + + Args: + iam_api_key: IBM Cloud IAM API key + """ + self.iam_api_key = iam_api_key + self._access_token = None + self._token_expiry = 0 + + def get_access_token(self) -> str: + """ + Get an IBM Cloud IAM access token. 
+ + Returns: + IAM access token + """ + print("Obtaining IBM Cloud IAM access token...") + + headers = { + "Content-Type": "application/x-www-form-urlencoded", + "Accept": "application/json" + } + + data = { + "grant_type": "urn:ibm:params:oauth:grant-type:apikey", + "apikey": self.iam_api_key + } + + try: + response = requests.post( + self.IAM_TOKEN_URL, + headers=headers, + data=data, + timeout=30 + ) + response.raise_for_status() + token_data = response.json() + self._access_token = token_data.get("access_token") + + if not self._access_token: + raise ValueError("No access token in IAM response") + + print("✓ IAM access token obtained successfully") + return self._access_token + + except requests.exceptions.RequestException as e: + print(f"Error obtaining IAM token: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + +class SysdigDashboardManager: + """Manages Sysdig dashboard creation and updates via REST API with IBM Cloud IAM authentication.""" + + # IBM Cloud Monitoring regional endpoints + REGION_ENDPOINTS = { + "us-south": "https://us-south.monitoring.cloud.ibm.com", + "us-east": "https://us-east.monitoring.cloud.ibm.com", + "eu-de": "https://eu-de.monitoring.cloud.ibm.com", + "eu-es": "https://eu-es.monitoring.cloud.ibm.com", + "eu-gb": "https://eu-gb.monitoring.cloud.ibm.com", + "jp-tok": "https://jp-tok.monitoring.cloud.ibm.com", + "au-syd": "https://au-syd.monitoring.cloud.ibm.com", + "jp-osa": "https://jp-osa.monitoring.cloud.ibm.com", + "ca-tor": "https://ca-tor.monitoring.cloud.ibm.com", + "br-sao": "https://br-sao.monitoring.cloud.ibm.com", + } + + def __init__(self, iam_auth: IBMCloudIAMAuth, instance_id: str, region: str): + """ + Initialize the Sysdig Dashboard Manager. 
+ + Args: + iam_auth: IBM Cloud IAM authentication handler + instance_id: IBM Cloud Monitoring instance ID (GUID) + region: IBM Cloud region (e.g., 'us-south', 'eu-de') + """ + if region not in self.REGION_ENDPOINTS: + raise ValueError( + f"Invalid region '{region}'. Valid regions: {', '.join(self.REGION_ENDPOINTS.keys())}" + ) + + self.iam_auth = iam_auth + self.instance_id = instance_id + self.region = region + self.base_url = self.REGION_ENDPOINTS[region] + + def _get_headers(self) -> Dict[str, str]: + """ + Get HTTP headers with IAM authentication. + + Returns: + Dictionary of HTTP headers + """ + access_token = self.iam_auth.get_access_token() + return { + "Authorization": f"Bearer {access_token}", + "IBMInstanceID": self.instance_id, + "Content-Type": "application/json", + } + + def list_dashboards(self) -> list: + """ + List all dashboards in the Sysdig instance. + + Returns: + List of dashboard objects (request errors are logged and re-raised) + """ + url = f"{self.base_url}/api/v3/dashboards" + + try: + response = requests.get(url, headers=self._get_headers(), timeout=30) + response.raise_for_status() + data = response.json() + return data.get("dashboards", []) + except requests.exceptions.RequestException as e: + print(f"Error listing dashboards: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise  # a failed listing must not look like "no dashboards": the import path would then create a duplicate + + def find_dashboard_by_name(self, name: str) -> Optional[Dict]: + """ + Find a dashboard by its name. + + Args: + name: Dashboard name to search for + + Returns: + Dashboard object if found, None otherwise + """ + dashboards = self.list_dashboards() + for dashboard in dashboards: + if dashboard.get("name") == name: + return dashboard + return None + + def create_dashboard(self, dashboard_config: Dict) -> Dict: + """ + Create a new dashboard. 
+ + Args: + dashboard_config: Dashboard configuration dictionary + + Returns: + Created dashboard object + """ + url = f"{self.base_url}/api/v3/dashboards" + + try: + response = requests.post( + url, + headers=self._get_headers(), + json={"dashboard": dashboard_config}, + timeout=30 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"Error creating dashboard: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + def update_dashboard(self, dashboard_id: int, dashboard_config: Dict) -> Dict: + """ + Update an existing dashboard. + + Args: + dashboard_id: ID of the dashboard to update + dashboard_config: New dashboard configuration + + Returns: + Updated dashboard object + """ + url = f"{self.base_url}/api/v3/dashboards/{dashboard_id}" + + try: + response = requests.put( + url, + headers=self._get_headers(), + json={"dashboard": dashboard_config}, + timeout=30 + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as e: + print(f"Error updating dashboard: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f"Response: {e.response.text}") + raise + + def import_or_update_dashboard(self, dashboard_config: Dict) -> Dict: + """ + Import a dashboard or update it if it already exists. 
+ + Args: + dashboard_config: Dashboard configuration dictionary + + Returns: + Dashboard object (created or updated) + """ + dashboard_name = dashboard_config.get("name") + if not dashboard_name: + raise ValueError("Dashboard configuration must include a 'name' field") + + print(f"Checking if dashboard '{dashboard_name}' exists...") + existing_dashboard = self.find_dashboard_by_name(dashboard_name) + + if existing_dashboard: + dashboard_id = existing_dashboard.get("id") + if dashboard_id is None: + raise ValueError(f"Dashboard '{dashboard_name}' found but has no ID") + print(f"Dashboard '{dashboard_name}' found (ID: {dashboard_id}). Updating...") + result = self.update_dashboard(dashboard_id, dashboard_config) + print(f"✓ Dashboard '{dashboard_name}' updated successfully!") + return result + else: + print(f"Dashboard '{dashboard_name}' not found. Creating new dashboard...") + result = self.create_dashboard(dashboard_config) + dashboard_id = result.get("dashboard", {}).get("id") + print(f"✓ Dashboard '{dashboard_name}' created successfully (ID: {dashboard_id})!") + return result + + +def load_dashboard_config(file_path: str) -> Dict: + """ + Load dashboard configuration from a JSON file. 
+ + Args: + file_path: Path to the JSON file + + Returns: + Dashboard configuration dictionary + """ + try: + with open(file_path, 'r', encoding="utf-8") as f:  # JSON is UTF-8; do not depend on the platform default encoding + return json.load(f) + except FileNotFoundError: + print(f"Error: Dashboard file '{file_path}' not found") + sys.exit(1) + except json.JSONDecodeError as e: + print(f"Error: Invalid JSON in dashboard file: {e}") + sys.exit(1) + + +def main(): + """Main entry point for the script.""" + parser = argparse.ArgumentParser( + description="Import or update IBM Cloud Monitoring (Sysdig) dashboards using IBM Cloud IAM authentication", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Using command-line arguments + python import_dashboard.py \\ + --iam-api-key YOUR_IAM_KEY \\ + --instance-id YOUR_INSTANCE_ID \\ + --region us-south \\ + --dashboard code-engine-component-resource-overview.json + + # Using environment variables + export IBM_CLOUD_IAM_API_KEY=YOUR_IAM_KEY + export SYSDIG_INSTANCE_ID=YOUR_INSTANCE_ID + export SYSDIG_REGION=us-south + python import_dashboard.py --dashboard code-engine-component-resource-overview.json + +Supported Regions: + us-south, us-east, eu-de, eu-es, eu-gb, jp-tok, au-syd, jp-osa, ca-tor, br-sao + +How to get your Instance ID: + 1. Go to IBM Cloud Console + 2. Navigate to your Monitoring instance + 3. Click on "Overview" or "Settings" + 4. 
Copy the Instance ID (GUID format: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx) + """ + ) + + parser.add_argument( + "--iam-api-key", + help="IBM Cloud IAM API key (or set IBM_CLOUD_IAM_API_KEY env var)", + default=os.environ.get("IBM_CLOUD_IAM_API_KEY") + ) + + parser.add_argument( + "--instance-id", + help="IBM Cloud Monitoring instance ID/GUID (or set SYSDIG_INSTANCE_ID env var)", + default=os.environ.get("SYSDIG_INSTANCE_ID") + ) + + parser.add_argument( + "--region", + help="IBM Cloud region (or set SYSDIG_REGION env var)", + default=os.environ.get("SYSDIG_REGION") + ) + + parser.add_argument( + "--dashboard", + required=True, + help="Path to dashboard JSON file" + ) + + args = parser.parse_args() + + # Validate required arguments + if not args.iam_api_key: + print("Error: IAM API key is required. Provide via --iam-api-key or IBM_CLOUD_IAM_API_KEY environment variable") + sys.exit(1) + + if not args.instance_id: + print("Error: Instance ID is required. Provide via --instance-id or SYSDIG_INSTANCE_ID environment variable") + sys.exit(1) + + if not args.region: + print("Error: Region is required. 
Provide via --region or SYSDIG_REGION environment variable") + sys.exit(1) + + # Load dashboard configuration + print(f"Loading dashboard configuration from '{args.dashboard}'...") + dashboard_config = load_dashboard_config(args.dashboard) + + # Initialize IAM authentication and dashboard manager + try: + iam_auth = IBMCloudIAMAuth(args.iam_api_key) + manager = SysdigDashboardManager(iam_auth, args.instance_id, args.region) + result = manager.import_or_update_dashboard(dashboard_config) + + # Print dashboard URL + dashboard_id = result.get("dashboard", {}).get("id") + if dashboard_id: + dashboard_url = f"{manager.base_url}/#/dashboards/{dashboard_id}" + print(f"\nDashboard URL: {dashboard_url}") + + print("\n✓ Operation completed successfully!") + + except ValueError as e: + print(f"Error: {e}") + sys.exit(1) + except Exception as e: + print(f"Unexpected error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/metrics-collector/start.sh b/metrics-collector/start.sh new file mode 100644 index 000000000..f5caf7daf --- /dev/null +++ b/metrics-collector/start.sh @@ -0,0 +1,93 @@ +#!/bin/sh +set -e + +echo "Starting Code Engine Metrics Collector..." + +# Check if METRICS_ENABLED is set to true +if [ "$METRICS_ENABLED" = "true" ]; then + echo "Prometheus metrics export enabled" + + # Check if monitoring API key secret is mounted + if [ ! 
-f "/etc/secrets/monitoring-apikey" ]; then + echo "ERROR: Prometheus agent requires /etc/secrets/monitoring-apikey to be mounted" + echo "Please create a secret with your IBM Cloud Monitoring API key and mount it at /etc/secrets/monitoring-apikey" + echo "Example:" + echo " ibmcloud ce secret create --name monitoring-apikey --from-literal monitoring-apikey=YOUR_API_KEY" + echo " ibmcloud ce job update --name metrics-collector --mount-secret /etc/secrets=monitoring-apikey" + exit 1 + fi + + # Check required environment variables + if [ -z "$CE_SUBDOMAIN" ]; then + echo "ERROR: CE_SUBDOMAIN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + if [ -z "$METRICS_REMOTE_WRITE_FQDN" ]; then + echo "ERROR: METRICS_REMOTE_WRITE_FQDN environment variable is required when METRICS_ENABLED=true" + exit 1 + fi + + # Generate prometheus.yml from template with environment variable substitution + echo "Generating Prometheus configuration..." + sed -e "s/\${CE_SUBDOMAIN}/$CE_SUBDOMAIN/g" \ + -e "s/\${METRICS_REMOTE_WRITE_FQDN}/$METRICS_REMOTE_WRITE_FQDN/g" \ + /etc/prometheus/prometheus.yml.template > /tmp/prometheus.yml + + echo "Starting Prometheus agent..." + /bin/prometheus --config.file=/tmp/prometheus.yml --agent --storage.agent.path=/tmp/agent-data --log.level info --log.format json 2>&1 & + PROMETHEUS_PID=$! + echo "Prometheus agent started with PID $PROMETHEUS_PID" + + # Give Prometheus a moment to start and check if it's actually running + sleep 2 + if ! kill -0 "$PROMETHEUS_PID" 2>/dev/null; then + echo "ERROR: Prometheus agent failed to start" + exit 1 + fi +else + echo "Prometheus metrics export disabled (METRICS_ENABLED not set to 'true')" +fi + +# Start the metrics collector +echo "Starting metrics collector..." +/app & +APP_PID=$! +echo "Metrics collector started with PID $APP_PID" + +# Function to handle shutdown +shutdown() { + echo "Shutting down..." 
+ if [ -n "$APP_PID" ]; then + kill -TERM "$APP_PID" 2>/dev/null || true + fi + if [ -n "$PROMETHEUS_PID" ]; then + kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true + fi + wait + exit 0 +} + +# Trap signals using POSIX names (no SIG prefix) so the script also works under shells that reject SIGTERM/SIGINT +trap shutdown TERM INT + +# Monitor processes; when one of them dies, stop the surviving sibling before exiting non-zero +while true; do + # Check if app is still running; if not, terminate the Prometheus agent (when it was started) + if ! kill -0 "$APP_PID" 2>/dev/null; then + echo "ERROR: Metrics collector process died unexpectedly" + if [ -n "$PROMETHEUS_PID" ]; then + kill -TERM "$PROMETHEUS_PID" 2>/dev/null || true + fi + exit 1 + fi + + # Check if Prometheus is still running (only if it was started); if not, terminate the collector + if [ "$METRICS_ENABLED" = "true" ] && ! kill -0 "$PROMETHEUS_PID" 2>/dev/null; then + echo "ERROR: Prometheus agent process died unexpectedly" + kill -TERM "$APP_PID" 2>/dev/null || true + exit 1 + fi + + sleep 5 +done diff --git a/private-path-to-vpc-vsi/ce-app/Dockerfile b/private-path-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/private-path-to-vpc-vsi/ce-app/Dockerfile +++ b/private-path-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . RUN CGO_ENABLED=0 go build -o /go/bin/app . 
# Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/private-path-to-vpc-vsi/ce-app/go.mod b/private-path-to-vpc-vsi/ce-app/go.mod index bff2499af..f65b725d1 100644 --- a/private-path-to-vpc-vsi/ce-app/go.mod +++ b/private-path-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-private-path -go 1.23.0 +go 1.25 require github.com/lib/pq v1.10.9 diff --git a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile index 7ae1e0829..93565d9e5 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/Dockerfile +++ b/satellite-connector-to-vpc-vsi/ce-app/Dockerfile @@ -1,10 +1,10 @@ -FROM quay.io/projectquay/golang:1.23 AS build-env +FROM quay.io/projectquay/golang:1.25 AS build-env WORKDIR /go/src/app COPY . . RUN CGO_ENABLED=0 go build -o /go/bin/app . # Copy the exe into a smaller base image -FROM gcr.io/distroless/static-debian12 +FROM gcr.io/distroless/static-debian13 COPY --from=build-env /go/bin/app / ENTRYPOINT ["/app"] diff --git a/satellite-connector-to-vpc-vsi/ce-app/go.mod b/satellite-connector-to-vpc-vsi/ce-app/go.mod index 06d27fb97..4511f6fe9 100644 --- a/satellite-connector-to-vpc-vsi/ce-app/go.mod +++ b/satellite-connector-to-vpc-vsi/ce-app/go.mod @@ -1,5 +1,5 @@ module github.com/IBM/CodeEngine/ce-satellite-connector -go 1.21.0 +go 1.25 require github.com/lib/pq v1.10.9