diff --git a/package.json b/package.json
index 03ca539..6367c01 100644
--- a/package.json
+++ b/package.json
@@ -12,7 +12,7 @@
     "deploy": "wrangler deploy",
     "start": "wrangler dev",
     "dev": "wrangler dev",
-    "cf-typegen": "wrangler types",
+    "cf-typegen": "wrangler types --strict-vars=false",
    "typecheck": "tsgo",
    "lint": "biome check .",
    "lint:fix": "biome check --write .",
diff --git a/readme.md b/readme.md
index 1b0243c..b18eaff 100644
--- a/readme.md
+++ b/readme.md
@@ -33,13 +33,36 @@ To run GraphQL queries from a specific jurisdiction (closer to the data source),
 
 ```jsonc
 "vars": {
-  "BATCH_SIZE": "5000",
-  "JURISDICTION": "eu" // e.g., "eu", "fedramp"
+  "BATCH_SIZE": 5000,
+  "RETRY_LIMIT": 3,
+  "RETRY_DELAY_SECONDS": 1,
+  "JURISDICTION": "eu", // e.g., "eu", "fedramp"
+  "DATADOG_TAGS": {}
 }
 ```
 
 This uses a Durable Object to proxy requests from the specified jurisdiction.
 
+#### Optional: Custom Datadog Tags
+
+Add custom tags to all metrics by setting the `DATADOG_TAGS` variable in `wrangler.jsonc`:
+
+```jsonc
+"vars": {
+  "BATCH_SIZE": 5000,
+  "RETRY_LIMIT": 3,
+  "RETRY_DELAY_SECONDS": 1,
+  "JURISDICTION": "eu",
+  "DATADOG_TAGS": {
+    "env": "production",
+    "team": "platform",
+    "service": "containers"
+  }
+}
+```
+
+These tags will be added to all health and resource metrics sent to Datadog.
+
 ### Verify
 
 ```bash
@@ -123,11 +146,41 @@ See [Datadog's documentation](https://docs.datadoghq.com/dashboards/configure/#c
 
 ## Workflow Behavior
 
-The exporter runs as a Cloudflare Workflow triggered every minute via cron. Each workflow step uses the default retry configuration:
+The exporter runs as a Cloudflare Workflow triggered every minute via cron. Each workflow step uses configurable retry settings:
 
-- **Retries**: 3 attempts
-- **Delay**: 1 second initial delay
-- **Backoff**: Exponential (1s, 2s, 4s)
+- **Retries**: Configurable via `RETRY_LIMIT` (default: 3 attempts)
+- **Delay**: Configurable via `RETRY_DELAY_SECONDS` (default: 1 second initial delay)
+- **Backoff**: Exponential (e.g., 1s, 2s, 4s)
 
 Steps will automatically retry on transient failures (API errors, network issues).
 
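+The workflow builds its per-step retry options from these variables once per
+run. The sketch below is condensed from `src/workflow.ts` and omits the
+surrounding class:
+
+```ts
+// Sketch: how RETRY_LIMIT and RETRY_DELAY_SECONDS feed the Workflows step config.
+const stepConfig = {
+  retries: {
+    limit: this.env.RETRY_LIMIT ?? 3,
+    delay: `${this.env.RETRY_DELAY_SECONDS ?? 1} seconds` as const,
+    backoff: "exponential" as const,
+  },
+};
+
+// Every step (container fetch, metrics download, batched export) reuses it:
+await step.do("fetch containers", stepConfig, async () => {
+  /* ... */
+});
+```
+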
+### Configuration Options
+
+| Variable | Type | Default | Description |
+|----------|------|---------|-------------|
+| `BATCH_SIZE` | number | 5000 | Maximum metrics per Datadog API request |
+| `RETRY_LIMIT` | number | 3 | Number of retry attempts for failed workflow steps |
+| `RETRY_DELAY_SECONDS` | number | 1 | Initial delay in seconds before retry (exponential backoff) |
+| `JURISDICTION` | string | "" | Durable Object jurisdiction for GraphQL queries (e.g., "eu", "fedramp") |
+| `DATADOG_TAGS` | object | {} | Custom tags to add to all metrics |
+
diff --git a/src/metrics.ts b/src/metrics.ts
index 4af2dbc..74ac774 100644
--- a/src/metrics.ts
+++ b/src/metrics.ts
@@ -1,17 +1,29 @@
+import { z } from "zod/v4";
 import type { DatadogMetric } from "./api/datadog";
 import type { Container, MetricsGroup } from "./types";
 
-export interface ContainerWithMetrics {
-  container: Container;
-  metrics: MetricsGroup[];
-}
-
 export interface ContainerInfo {
   id: string;
   name: string;
   version: number;
 }
 
+const DatadogTagsSchema = z.record(z.string(), z.string()).optional();
+
+function parseCustomTags(datadogTags: unknown): string[] {
+  const parsed = DatadogTagsSchema.safeParse(datadogTags);
+  if (!parsed.success) {
+    console.warn("Invalid DATADOG_TAGS format, ignoring custom tags", {
+      error: parsed.error.message,
+    });
+    return [];
+  }
+  if (!parsed.data) {
+    return [];
+  }
+  return Object.entries(parsed.data).map(([key, value]) => `${key}:${value}`);
+}
+
 /**
  * Format metrics for a single container into Datadog metrics
  */
@@ -20,7 +32,9 @@ export function formatMetricsForContainer(
   container: ContainerInfo,
   metricsGroups: MetricsGroup[],
   timestamp?: number,
+  datadogTags?: unknown,
 ): DatadogMetric[] {
+  const customTags = parseCustomTags(datadogTags);
   const ts = timestamp ?? Math.floor(Date.now() / 1000);
   const metrics: DatadogMetric[] = [];
 
@@ -32,6 +46,7 @@ export function formatMetricsForContainer(
       `version:${container.version}`,
       `instance_id:${group.dimensions.deploymentId}`,
       `placement_id:${group.dimensions.placementId}`,
+      ...customTags,
     ];
 
     // CPU metrics
@@ -138,26 +153,6 @@ export function formatMetricsForContainer(
   return metrics;
 }
 
-/**
- * Format container metrics data into Datadog metrics
- */
-export function formatContainerMetrics(
-  accountId: string,
-  containersWithMetrics: ContainerWithMetrics[],
-  timestamp?: number,
-): DatadogMetric[] {
-  const ts = timestamp ?? Math.floor(Date.now() / 1000);
-  const metrics: DatadogMetric[] = [];
-
-  for (const { container, metrics: groups } of containersWithMetrics) {
-    metrics.push(
-      ...formatMetricsForContainer(accountId, container, groups, ts),
-    );
-  }
-
-  return metrics;
-}
-
 /**
  * Format container health data into Datadog metrics
  */
@@ -165,9 +160,11 @@ export function formatHealthMetrics(
   accountId: string,
   containers: Container[],
   timestamp?: number,
+  datadogTags?: unknown,
 ): DatadogMetric[] {
+  const customTags = parseCustomTags(datadogTags);
   const ts = timestamp ?? Math.floor(Date.now() / 1000);
-  const baseTags = [`account_id:${accountId}`];
+  const baseTags = [`account_id:${accountId}`, ...customTags];
   const metrics: DatadogMetric[] = [];
 
   const totals = {
diff --git a/src/workflow.ts b/src/workflow.ts
index 7ec2490..4d06836 100644
--- a/src/workflow.ts
+++ b/src/workflow.ts
@@ -27,17 +27,19 @@ function getMetricsTimeWindow(now: Date = new Date()): {
   return { start, end };
 }
 
-const STEP_CONFIG = {
-  retries: {
-    limit: 3,
-    delay: "1 second" as const,
-    backoff: "exponential" as const,
-  },
-};
-
 export class MetricsExporterWorkflow extends WorkflowEntrypoint {
   async run(_event: WorkflowEvent, step: WorkflowStep) {
-    const batchSize = Number.parseInt(this.env.BATCH_SIZE || "5000", 10);
+    const batchSize = this.env.BATCH_SIZE ?? 5000;
+    const retryLimit = this.env.RETRY_LIMIT ?? 3;
+    const retryDelaySeconds = this.env.RETRY_DELAY_SECONDS ?? 1;
+
+    const stepConfig = {
+      retries: {
+        limit: retryLimit,
+        delay: `${retryDelaySeconds} seconds` as const,
+        backoff: "exponential" as const,
+      },
+    };
 
     // Create a fetcher that proxies requests through a Durable Object in a specific jurisdiction
     // This ensures GraphQL queries run close to the data source
@@ -70,7 +72,7 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint {
 
     const containers = await step.do(
       "fetch containers",
-      STEP_CONFIG,
+      stepConfig,
       async () => {
         const result = await cloudflare.listContainers();
         console.log("Fetched containers", { count: result.length });
@@ -78,6 +80,8 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint {
         const healthMetrics = formatHealthMetrics(
           this.env.CLOUDFLARE_ACCOUNT_ID,
           result,
+          undefined,
+          this.env.DATADOG_TAGS,
         );
 
         await datadog.sendMetrics(healthMetrics);
@@ -94,7 +98,7 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint {
     for (const container of containers) {
       const count = await step.do(
         `Download Metrics: ${container.name}`,
-        STEP_CONFIG,
+        stepConfig,
         async () => {
           const metricsGroups = await cloudflare.getContainerMetrics(
             container.id,
@@ -106,6 +110,8 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint {
             this.env.CLOUDFLARE_ACCOUNT_ID,
             container,
             metricsGroups,
+            undefined,
+            this.env.DATADOG_TAGS,
           );
 
           const batches = chunk(metrics, batchSize);
@@ -115,7 +121,7 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint {
             (batch, i) => () =>
               step.do(
                 `Export Metrics: ${container.name} batch ${i + 1}/${batches.length}`,
-                STEP_CONFIG,
+                stepConfig,
                 async () => {
                   await datadog.sendMetrics(batch);
                 },
diff --git a/test/metricsformatting.test.ts b/test/metricsformatting.test.ts
index f9fb833..59a9f71 100644
--- a/test/metricsformatting.test.ts
+++ b/test/metricsformatting.test.ts
@@ -1,10 +1,5 @@
-import { describe, expect, it } from "vitest";
-import {
-  type ContainerWithMetrics,
-  formatContainerMetrics,
-  formatHealthMetrics,
-  formatMetricsForContainer,
-} from "../src/metrics";
+import { describe, expect, it, vi } from "vitest";
+import { formatHealthMetrics, formatMetricsForContainer } from "../src/metrics";
 import {
   createMockMetricsGroup,
   mockContainers,
@@ -73,214 +68,105 @@ describe("formatMetricsForContainer", () => {
     expect(cpuMetric?.tags).toContain("placement_id:placement-test");
   });
 
-  it("returns empty array for no metrics groups", () => {
+  it("includes custom tags when provided", () => {
     const container = { id: "app-123", name: "my-app", version: 1 };
+    const datadogTags = { env: "production", team: "platform" };
     const metrics = formatMetricsForContainer(
       TEST_ACCOUNT_ID,
       container,
-      [],
-      TEST_TIMESTAMP,
-    );
-    expect(metrics).toHaveLength(0);
-  });
-});
-
-describe("formatContainerMetrics", () => {
-  it("formats metrics for a single container with one metrics group", () => {
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: [mockMetricsGroups[0]],
-      },
-    ];
-
-    const metrics = formatContainerMetrics(
-      TEST_ACCOUNT_ID,
-      containersWithMetrics,
+      [mockMetricsGroups[0]],
       TEST_TIMESTAMP,
+      datadogTags,
     );
-    // 4 CPU + 4 Memory + 4 Disk + 2 Bandwidth = 14 metrics per group
-    expect(metrics).toHaveLength(14);
-  });
-
-  it("formats metrics for multiple containers with multiple groups", () => {
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: mockMetricsGroups, // 2 groups
-      },
-      {
-        container: mockContainers[1],
-        metrics: [mockMetricsGroups[0]], // 1 group
-      },
-    ];
-
-    const metrics = formatContainerMetrics(
-      TEST_ACCOUNT_ID,
-      containersWithMetrics,
-      TEST_TIMESTAMP,
+    const cpuMetric = metrics.find(
+      (m) =>
+        m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
     );
-    // 3 groups * 14 metrics = 42 metrics
-    expect(metrics).toHaveLength(42);
+    expect(cpuMetric).toBeDefined();
+    expect(cpuMetric?.tags).toContain("env:production");
+    expect(cpuMetric?.tags).toContain("team:platform");
   });
 
-  it("includes correct tags for each metric", () => {
-    const group = createMockMetricsGroup({
-      dimensions: {
-        applicationId: "app-test",
-        datetimeMinute: "2025-12-05T16:00:00Z",
-        deploymentId: "instance-test",
-        placementId: "placement-test",
-      },
-    });
+  it("handles invalid datadogTags gracefully", () => {
+    const container = { id: "app-123", name: "my-app", version: 1 };
+    const invalidTags = { env: "prod", count: 123 } as unknown;
 
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: [group],
-      },
-    ];
+    const consoleWarnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
 
-    const metrics = formatContainerMetrics(
+    const metrics = formatMetricsForContainer(
       TEST_ACCOUNT_ID,
-      containersWithMetrics,
+      container,
+      [mockMetricsGroups[0]],
       TEST_TIMESTAMP,
+      invalidTags,
     );
 
+    expect(metrics).toHaveLength(14);
     const cpuMetric = metrics.find(
       (m) =>
         m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
     );
-
-    expect(cpuMetric).toBeDefined();
-    expect(cpuMetric?.tags).toContain(`account_id:${TEST_ACCOUNT_ID}`);
-    expect(cpuMetric?.tags).toContain("application_id:app-test");
-    expect(cpuMetric?.tags).toContain(
-      `application_name:${mockContainers[0].name}`,
+    expect(cpuMetric?.tags).not.toContain("count:123");
+    expect(cpuMetric?.tags).not.toContain("env:prod");
+    expect(consoleWarnSpy).toHaveBeenCalledWith(
+      "Invalid DATADOG_TAGS format, ignoring custom tags",
+      expect.objectContaining({ error: expect.any(String) }),
     );
-    expect(cpuMetric?.tags).toContain("instance_id:instance-test");
-    expect(cpuMetric?.tags).toContain("placement_id:placement-test");
-    expect(cpuMetric?.tags).toContain("stat:p50");
-  });
 
-  it("uses correct metric values from the group", () => {
-    const group = createMockMetricsGroup({
-      max: { cpuLoad: 0.99, memory: 999999999, diskUsage: 5000000000 },
-      quantiles: {
-        cpuLoadP50: 0.42,
-        cpuLoadP90: 0.8,
-        cpuLoadP99: 0.95,
-        memoryP50: 100000000,
-        memoryP90: 200000000,
-        memoryP99: 300000000,
-        diskUsageP50: 1000000000,
-        diskUsageP90: 2000000000,
-        diskUsageP99: 4000000000,
-      },
-      sum: { rxBytes: 1000000, txBytes: 500000 },
-    });
+    consoleWarnSpy.mockRestore();
+  });
 
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: [group],
-      },
-    ];
+  it("handles empty datadogTags object", () => {
+    const container = { id: "app-123", name: "my-app", version: 1 };
+    const datadogTags = {};
 
-    const metrics = formatContainerMetrics(
+    const metrics = formatMetricsForContainer(
       TEST_ACCOUNT_ID,
-      containersWithMetrics,
+      container,
+      [mockMetricsGroups[0]],
       TEST_TIMESTAMP,
+      datadogTags,
     );
 
-    // Check CPU p50
-    const cpuP50 = metrics.find(
+    expect(metrics).toHaveLength(14);
+    const cpuMetric = metrics.find(
       (m) =>
         m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
     );
-    expect(cpuP50?.points[0]).toEqual([TEST_TIMESTAMP, 0.42]);
-
-    // Check CPU max
-    const cpuMax = metrics.find(
-      (m) =>
-        m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:max"),
-    );
-    expect(cpuMax?.points[0]).toEqual([TEST_TIMESTAMP, 0.99]);
-
-    // Check memory p99
-    const memoryP99 = metrics.find(
-      (m) =>
-        m.metric === "cloudflare.containers.memory" &&
-        m.tags.includes("stat:p99"),
-    );
-    expect(memoryP99?.points[0]).toEqual([TEST_TIMESTAMP, 300000000]);
-
-    // Check bandwidth rx
-    const bandwidthRx = metrics.find(
-      (m) => m.metric === "cloudflare.containers.bandwidth.rx",
-    );
-    expect(bandwidthRx?.points[0]).toEqual([TEST_TIMESTAMP, 1000000]);
-    expect(bandwidthRx?.type).toBe("count");
+    expect(cpuMetric).toBeDefined();
+    expect(cpuMetric?.tags).toContain(`account_id:${TEST_ACCOUNT_ID}`);
   });
 
-  it("generates correct metric types", () => {
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: [mockMetricsGroups[0]],
-      },
-    ];
+  it("handles undefined datadogTags", () => {
+    const container = { id: "app-123", name: "my-app", version: 1 };
 
-    const metrics = formatContainerMetrics(
+    const metrics = formatMetricsForContainer(
       TEST_ACCOUNT_ID,
-      containersWithMetrics,
+      container,
+      [mockMetricsGroups[0]],
       TEST_TIMESTAMP,
+      undefined,
     );
 
-    // CPU, Memory, Disk should be gauges
-    const cpuMetrics = metrics.filter(
-      (m) => m.metric === "cloudflare.containers.cpu",
-    );
-    expect(cpuMetrics.every((m) => m.type === "gauge")).toBe(true);
-
-    const memoryMetrics = metrics.filter(
-      (m) => m.metric === "cloudflare.containers.memory",
-    );
-    expect(memoryMetrics.every((m) => m.type === "gauge")).toBe(true);
-
-    const diskMetrics = metrics.filter(
-      (m) => m.metric === "cloudflare.containers.disk",
-    );
-    expect(diskMetrics.every((m) => m.type === "gauge")).toBe(true);
-
-    // Bandwidth should be counts
-    const bandwidthMetrics = metrics.filter((m) =>
-      m.metric.startsWith("cloudflare.containers.bandwidth"),
+    expect(metrics).toHaveLength(14);
+    const cpuMetric = metrics.find(
+      (m) =>
+        m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
     );
-    expect(bandwidthMetrics.every((m) => m.type === "count")).toBe(true);
-  });
-
-  it("returns empty array for empty input", () => {
-    const metrics = formatContainerMetrics(TEST_ACCOUNT_ID, [], TEST_TIMESTAMP);
-    expect(metrics).toHaveLength(0);
+    expect(cpuMetric).toBeDefined();
+    expect(cpuMetric?.tags).toContain(`account_id:${TEST_ACCOUNT_ID}`);
   });
 
-  it("handles containers with no metrics", () => {
-    const containersWithMetrics: ContainerWithMetrics[] = [
-      {
-        container: mockContainers[0],
-        metrics: [],
-      },
-    ];
-
-    const metrics = formatContainerMetrics(
+  it("returns empty array for no metrics groups", () => {
+    const container = { id: "app-123", name: "my-app", version: 1 };
+    const metrics = formatMetricsForContainer(
       TEST_ACCOUNT_ID,
-      containersWithMetrics,
+      container,
+      [],
       TEST_TIMESTAMP,
     );
-
     expect(metrics).toHaveLength(0);
   });
 });
@@ -324,6 +210,52 @@ describe("formatHealthMetrics", () => {
     ).toEqual([TEST_TIMESTAMP, 15]);
   });
 
+  it("includes custom tags when provided", () => {
+    const datadogTags = { env: "staging", region: "us-west" };
+    const metrics = formatHealthMetrics(
+      TEST_ACCOUNT_ID,
+      mockContainers,
+      TEST_TIMESTAMP,
+      datadogTags,
+    );
+
+    // All metrics should include custom tags
+    const firstMetric = metrics[0];
+    expect(firstMetric.tags).toContain("env:staging");
+    expect(firstMetric.tags).toContain("region:us-west");
+  });
+
+  it("handles invalid datadogTags gracefully", () => {
+    const invalidTags = { valid: "tag", invalid: 999 } as unknown;
+
+    const metrics = formatHealthMetrics(
+      TEST_ACCOUNT_ID,
+      mockContainers,
+      TEST_TIMESTAMP,
+      invalidTags,
+    );
+
+    expect(metrics).toHaveLength(21);
+    const firstMetric = metrics[0];
+    expect(firstMetric.tags).not.toContain("valid:tag");
+    expect(firstMetric.tags).not.toContain("invalid:999");
+  });
+
+  it("handles empty datadogTags object", () => {
+    const datadogTags = {};
+
+    const metrics = formatHealthMetrics(
+      TEST_ACCOUNT_ID,
+      mockContainers,
+      TEST_TIMESTAMP,
+      datadogTags,
+    );
+
+    expect(metrics).toHaveLength(21);
+    const firstMetric = metrics[0];
+    expect(firstMetric.tags).toContain(`account_id:${TEST_ACCOUNT_ID}`);
+  });
+
   it("includes account_id tag", () => {
     const metrics = formatHealthMetrics(
       TEST_ACCOUNT_ID,
diff --git a/worker-configuration.d.ts b/worker-configuration.d.ts
index cfa6451..ec000bd 100644
--- a/worker-configuration.d.ts
+++ b/worker-configuration.d.ts
@@ -1,5 +1,5 @@
 /* eslint-disable */
-// Generated by Wrangler by running `wrangler types` (hash: e8b40c011669a45effc49e682978268c)
+// Generated by Wrangler by running `wrangler types --strict-vars=false` (hash: 3f599118a9a0126653b7d631a726e0bb)
 // Runtime types generated with workerd@1.20251202.0 2025-11-17
 declare namespace Cloudflare {
   interface GlobalProps {
@@ -7,8 +7,11 @@ declare namespace Cloudflare {
     durableNamespaces: "RequestProxy";
   }
   interface Env {
-    BATCH_SIZE: "5000";
-    JURISDICTION: "fedramp";
+    BATCH_SIZE: number;
+    RETRY_LIMIT: number;
+    RETRY_DELAY_SECONDS: number;
+    JURISDICTION: string;
+    DATADOG_TAGS: object;
     CLOUDFLARE_ACCOUNT_ID: string;
     CLOUDFLARE_API_TOKEN: string;
     DATADOG_API_KEY: string;
diff --git a/wrangler.jsonc b/wrangler.jsonc
index a678e7d..f8477a6 100644
--- a/wrangler.jsonc
+++ b/wrangler.jsonc
@@ -33,7 +33,12 @@
     }
   ],
   "vars": {
-    "BATCH_SIZE": "5000",
-    "JURISDICTION": ""
+    "BATCH_SIZE": 5000,
+    "RETRY_LIMIT": 3,
+    "RETRY_DELAY_SECONDS": 1,
+    "JURISDICTION": "",
+    "DATADOG_TAGS": {
+      "env": "sandbox"
+    }
   }
 }