package.json (2 changes: 1 addition & 1 deletion)
@@ -12,7 +12,7 @@
"deploy": "wrangler deploy",
"start": "wrangler dev",
"dev": "wrangler dev",
"cf-typegen": "wrangler types",
"cf-typegen": "wrangler types --strict-vars=false",
"typecheck": "tsgo",
"lint": "biome check .",
"lint:fix": "biome check --write .",
readme.md (45 changes: 39 additions & 6 deletions)
@@ -33,13 +33,36 @@ To run GraphQL queries from a specific jurisdiction (closer to the data source),

```jsonc
"vars": {
"BATCH_SIZE": "5000",
"JURISDICTION": "eu" // e.g., "eu", "fedramp"
"BATCH_SIZE": 5000,
"RETRY_LIMIT": 3,
"RETRY_DELAY_SECONDS": 1,
"JURISDICTION": "eu", // e.g., "eu", "fedramp"
"DATADOG_TAGS": {}
}
```

This uses a Durable Object to proxy requests from the specified jurisdiction.
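
A minimal sketch of that pattern, assuming a Durable Object binding named `JURISDICTION_PROXY` (the binding name and class are illustrative, not this repo's actual code):

```ts
import { DurableObject } from "cloudflare:workers";

// A proxy object that simply forwards requests. Because the object is
// created in the configured jurisdiction, the GraphQL round trip to the
// Cloudflare API runs close to the data source.
export class JurisdictionProxy extends DurableObject {
  async fetch(request: Request): Promise<Response> {
    return fetch(request);
  }
}

// newUniqueId({ jurisdiction }) pins the object to the requested region.
function getEuProxy(env: { JURISDICTION_PROXY: DurableObjectNamespace }) {
  const id = env.JURISDICTION_PROXY.newUniqueId({ jurisdiction: "eu" });
  return env.JURISDICTION_PROXY.get(id);
}
```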

#### Optional: Custom Datadog Tags

Add custom tags to all metrics by setting the `DATADOG_TAGS` variable in `wrangler.jsonc`:

```jsonc
"vars": {
"BATCH_SIZE": 5000,
"RETRY_LIMIT": 3,
"RETRY_DELAY_SECONDS": 1,
"JURISDICTION": "eu",
"DATADOG_TAGS": {
"env": "production",
"team": "platform",
"service": "containers"
}
}
```

These tags will be added to all health and resource metrics sent to Datadog.
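
Under the hood the object is flattened into Datadog's `key:value` tag strings, mirroring the `parseCustomTags` helper in `src/metrics.ts`:

```ts
// The DATADOG_TAGS object configured above...
const datadogTags = { env: "production", team: "platform", service: "containers" };

// ...becomes one tag string per entry (see parseCustomTags in src/metrics.ts):
const tags = Object.entries(datadogTags).map(([key, value]) => `${key}:${value}`);
// => ["env:production", "team:platform", "service:containers"]
```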

### Verify

```bash
@@ -123,11 +146,21 @@ See [Datadog's documentation](https://docs.datadoghq.com/dashboards/configure/#c

## Workflow Behavior

The exporter runs as a Cloudflare Workflow triggered every minute via cron. Each workflow step uses the default retry configuration:
The exporter runs as a Cloudflare Workflow triggered every minute via cron. Each workflow step uses configurable retry settings:

- **Retries**: 3 attempts
- **Delay**: 1 second initial delay
- **Backoff**: Exponential (1s, 2s, 4s)
- **Retries**: Configurable via `RETRY_LIMIT` (default: 3 attempts)
- **Delay**: Configurable via `RETRY_DELAY_SECONDS` (default: 1 second initial delay)
- **Backoff**: Exponential (e.g., 1s, 2s, 4s)

Steps will automatically retry on transient failures (API errors, network issues).
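
As a rough sketch of the resulting schedule, assuming the delay doubles on each attempt and ignoring any jitter the Workflows runtime may add:

```ts
// Approximate retry delays for the defaults RETRY_DELAY_SECONDS = 1,
// RETRY_LIMIT = 3 under exponential backoff.
const retryDelaySeconds = 1;
const retryLimit = 3;
const delays = Array.from(
  { length: retryLimit },
  (_, attempt) => retryDelaySeconds * 2 ** attempt,
);
// => [1, 2, 4] (seconds)
```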

### Configuration Options

| Variable | Type | Default | Description |
|----------|------|---------|-------------|
| `BATCH_SIZE` | number | 5000 | Maximum metrics per Datadog API request |
| `RETRY_LIMIT` | number | 3 | Number of retry attempts for failed workflow steps |
| `RETRY_DELAY_SECONDS` | number | 1 | Initial delay in seconds before retry (exponential backoff) |
| `JURISDICTION` | string | "" | Durable Object jurisdiction for GraphQL queries (e.g., "eu", "fedramp") |
| `DATADOG_TAGS` | object | {} | Custom tags to add to all metrics |

src/metrics.ts (49 changes: 23 additions & 26 deletions)
@@ -1,17 +1,29 @@
import { z } from "zod/v4";
import type { DatadogMetric } from "./api/datadog";
import type { Container, MetricsGroup } from "./types";

export interface ContainerWithMetrics {
container: Container;
metrics: MetricsGroup[];
}

export interface ContainerInfo {
id: string;
name: string;
version: number;
}

const DatadogTagsSchema = z.record(z.string(), z.string()).optional();

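/**
 * Parse the DATADOG_TAGS var into `key:value` tag strings.
 * Invalid values are logged and ignored so a malformed config
 * never blocks a metrics export.
 */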
function parseCustomTags(datadogTags: unknown): string[] {
const parsed = DatadogTagsSchema.safeParse(datadogTags);
if (!parsed.success) {
console.warn("Invalid DATADOG_TAGS format, ignoring custom tags", {
error: parsed.error.message,
});
return [];
}
if (!parsed.data) {
return [];
}
return Object.entries(parsed.data).map(([key, value]) => `${key}:${value}`);
}

/**
* Format metrics for a single container into Datadog metrics
*/
@@ -20,7 +32,9 @@ export function formatMetricsForContainer(
container: ContainerInfo,
metricsGroups: MetricsGroup[],
timestamp?: number,
datadogTags?: unknown,
): DatadogMetric[] {
const customTags = parseCustomTags(datadogTags);
const ts = timestamp ?? Math.floor(Date.now() / 1000);
const metrics: DatadogMetric[] = [];

@@ -32,6 +46,7 @@ export function formatMetricsForContainer(
`version:${container.version}`,
`instance_id:${group.dimensions.deploymentId}`,
`placement_id:${group.dimensions.placementId}`,
...customTags,
];

// CPU metrics
@@ -138,36 +153,18 @@ export function formatMetricsForContainer(
return metrics;
}

/**
* Format container metrics data into Datadog metrics
*/
export function formatContainerMetrics(
accountId: string,
containersWithMetrics: ContainerWithMetrics[],
timestamp?: number,
): DatadogMetric[] {
const ts = timestamp ?? Math.floor(Date.now() / 1000);
const metrics: DatadogMetric[] = [];

for (const { container, metrics: groups } of containersWithMetrics) {
metrics.push(
...formatMetricsForContainer(accountId, container, groups, ts),
);
}

return metrics;
}

/**
* Format container health data into Datadog metrics
*/
export function formatHealthMetrics(
accountId: string,
containers: Container[],
timestamp?: number,
datadogTags?: unknown,
): DatadogMetric[] {
const customTags = parseCustomTags(datadogTags);
const ts = timestamp ?? Math.floor(Date.now() / 1000);
const baseTags = [`account_id:${accountId}`];
const baseTags = [`account_id:${accountId}`, ...customTags];
const metrics: DatadogMetric[] = [];

const totals = {
src/workflow.ts (30 changes: 18 additions & 12 deletions)
@@ -27,17 +27,19 @@ function getMetricsTimeWindow(now: Date = new Date()): {
return { start, end };
}

const STEP_CONFIG = {
retries: {
limit: 3,
delay: "1 second" as const,
backoff: "exponential" as const,
},
};

export class MetricsExporterWorkflow extends WorkflowEntrypoint<Env> {
async run(_event: WorkflowEvent<unknown>, step: WorkflowStep) {
const batchSize = Number.parseInt(this.env.BATCH_SIZE || "5000", 10);
const batchSize = this.env.BATCH_SIZE ?? 5000;
const retryLimit = this.env.RETRY_LIMIT ?? 3;
const retryDelaySeconds = this.env.RETRY_DELAY_SECONDS ?? 1;

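// Shared retry settings for every workflow step; the delay grows
// exponentially from the configured initial value (e.g. 1s, 2s, 4s).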
const stepConfig = {
retries: {
limit: retryLimit,
delay: `${retryDelaySeconds} seconds` as const,
backoff: "exponential" as const,
},
};

// Create a fetcher that proxies requests through a Durable Object in a specific jurisdiction
// This ensures GraphQL queries run close to the data source
@@ -70,14 +72,16 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint<Env> {

const containers = await step.do(
"fetch containers",
STEP_CONFIG,
stepConfig,
async () => {
const result = await cloudflare.listContainers();
console.log("Fetched containers", { count: result.length });

const healthMetrics = formatHealthMetrics(
this.env.CLOUDFLARE_ACCOUNT_ID,
result,
undefined,
this.env.DATADOG_TAGS,
);
await datadog.sendMetrics(healthMetrics);

@@ -94,7 +98,7 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint<Env> {
for (const container of containers) {
const count = await step.do(
`Download Metrics: ${container.name}`,
STEP_CONFIG,
stepConfig,
async () => {
const metricsGroups = await cloudflare.getContainerMetrics(
container.id,
@@ -106,6 +110,8 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint<Env> {
this.env.CLOUDFLARE_ACCOUNT_ID,
container,
metricsGroups,
undefined,
this.env.DATADOG_TAGS,
);

const batches = chunk(metrics, batchSize);
@@ -115,7 +121,7 @@ export class MetricsExporterWorkflow extends WorkflowEntrypoint<Env> {
(batch, i) => () =>
step.do(
`Export Metrics: ${container.name} batch ${i + 1}/${batches.length}`,
STEP_CONFIG,
stepConfig,
async () => {
await datadog.sendMetrics(batch);
},