Skip to content

Commit 321a936

Browse files
committed
Align on p50 rather than avg for all instance metrics
1 parent 1eb64bc commit 321a936

File tree

7 files changed

+52
-63
lines changed

7 files changed

+52
-63
lines changed

datadog-dashboard.json

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@
113113
{
114114
"id": 21,
115115
"definition": {
116-
"title": "CPU Load by Application (Avg)",
116+
"title": "CPU Load by Application (P50)",
117117
"title_size": "16",
118118
"title_align": "left",
119119
"show_legend": true,
@@ -127,7 +127,7 @@
127127
{
128128
"name": "query1",
129129
"data_source": "metrics",
130-
"query": "avg:cloudflare.containers.cpu{stat:avg,$application_name} by {application_name}"
130+
"query": "avg:cloudflare.containers.cpu{stat:p50,$application_name} by {application_name}"
131131
}
132132
],
133133
"response_format": "timeseries",
@@ -193,7 +193,7 @@
193193
{
194194
"name": "query1",
195195
"data_source": "metrics",
196-
"query": "avg:cloudflare.containers.cpu{stat:avg,$application_name} by {application_name}",
196+
"query": "avg:cloudflare.containers.cpu{stat:p50,$application_name} by {application_name}",
197197
"aggregator": "avg"
198198
}
199199
],
@@ -217,7 +217,7 @@
217217
{
218218
"name": "query1",
219219
"data_source": "metrics",
220-
"query": "avg:cloudflare.containers.cpu{stat:avg,$application_name} by {placement_id}"
220+
"query": "avg:cloudflare.containers.cpu{stat:p50,$application_name} by {placement_id}"
221221
}
222222
],
223223
"response_format": "timeseries",
@@ -241,7 +241,7 @@
241241
{
242242
"id": 31,
243243
"definition": {
244-
"title": "Memory Usage by Application (Avg)",
244+
"title": "Memory Usage by Application (P50)",
245245
"title_size": "16",
246246
"title_align": "left",
247247
"show_legend": true,
@@ -255,7 +255,7 @@
255255
{
256256
"name": "query1",
257257
"data_source": "metrics",
258-
"query": "avg:cloudflare.containers.memory{stat:avg,$application_name} by {application_name}"
258+
"query": "avg:cloudflare.containers.memory{stat:p50,$application_name} by {application_name}"
259259
}
260260
],
261261
"response_format": "timeseries",
@@ -321,7 +321,7 @@
321321
{
322322
"name": "query1",
323323
"data_source": "metrics",
324-
"query": "avg:cloudflare.containers.memory{stat:avg,$application_name} by {application_name}",
324+
"query": "avg:cloudflare.containers.memory{stat:p50,$application_name} by {application_name}",
325325
"aggregator": "avg"
326326
}
327327
],
@@ -345,7 +345,7 @@
345345
{
346346
"name": "query1",
347347
"data_source": "metrics",
348-
"query": "avg:cloudflare.containers.memory{stat:avg,$application_name} by {placement_id}"
348+
"query": "avg:cloudflare.containers.memory{stat:p50,$application_name} by {placement_id}"
349349
}
350350
],
351351
"response_format": "timeseries",
@@ -557,20 +557,20 @@
557557
{
558558
"formulas": [
559559
{
560-
"alias": "CPU Avg",
560+
"alias": "CPU P50",
561561
"formula": "query1",
562562
"limit": { "count": 50, "order": "desc" }
563563
},
564564
{ "alias": "CPU P90", "formula": "query2" },
565-
{ "alias": "Memory Avg (GB)", "formula": "query3 / 1000000000" },
565+
{ "alias": "Memory P50 (GB)", "formula": "query3 / 1000000000" },
566566
{ "alias": "Memory P90 (GB)", "formula": "query4 / 1000000000" },
567567
{ "alias": "Disk P90 (GB)", "formula": "query5 / 1000000000" }
568568
],
569569
"queries": [
570570
{
571571
"name": "query1",
572572
"data_source": "metrics",
573-
"query": "avg:cloudflare.containers.cpu{stat:avg,$application_name} by {application_name}",
573+
"query": "avg:cloudflare.containers.cpu{stat:p50,$application_name} by {application_name}",
574574
"aggregator": "avg"
575575
},
576576
{
@@ -582,7 +582,7 @@
582582
{
583583
"name": "query3",
584584
"data_source": "metrics",
585-
"query": "avg:cloudflare.containers.memory{stat:avg,$application_name} by {application_name}",
585+
"query": "avg:cloudflare.containers.memory{stat:p50,$application_name} by {application_name}",
586586
"aggregator": "avg"
587587
},
588588
{

readme.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,18 @@ Use `instances.total.max - instances.total.healthy` to calculate available capac
8484

8585
**Tags:** `account_id`, `application_id`, `application_name`, `deployment_id`, `placement_id`, `stat`
8686

87-
The `stat` tag indicates the aggregation: `avg`, `p90`, `p99`, `max` (bandwidth metrics don't have a stat tag).
87+
The `stat` tag indicates the aggregation: `p50`, `p90`, `p99`, `max` (bandwidth metrics don't have a stat tag).
88+
89+
## Datadog Dashboard
90+
91+
A pre-built dashboard is included in `datadog-dashboard.json`. To import it:
92+
93+
1. In Datadog, go to **Dashboards** → **New Dashboard** → **New Dashboard**
94+
2. Click the cog icon (⚙️) in the top right
95+
3. Select **Import dashboard JSON**
96+
4. Paste the contents of `datadog-dashboard.json`
97+
98+
See [Datadog's documentation](https://docs.datadoghq.com/dashboards/configure/#copy-import-or-export-dashboard-json) for more details.
8899

89100
## Workflow Behavior
90101

@@ -94,4 +105,5 @@ The exporter runs as a Cloudflare Workflow triggered every minute via cron. Each
94105
- **Delay**: 1 second initial delay
95106
- **Backoff**: Exponential (1s, 2s, 4s)
96107

97-
Steps will automatically retry on transient failures (API errors, network issues).
108+
Steps will automatically retry on transient failures (API errors, network issues).
109+

src/api/cloudflare.ts

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,6 @@ query GetCloudchamberMetrics($accountTag: string!, $datetimeStart: Time, $dateti
2727
viewer {
2828
accounts(filter: {accountTag: $accountTag}) {
2929
cloudchamberMetricsAdaptiveGroups(limit: 10000, filter: {applicationId_in: $applicationIds, datetimeMinute_geq: $datetimeStart, datetimeMinute_leq: $datetimeEnd}) {
30-
avg {
31-
memory
32-
cpuLoad
33-
}
3430
max {
3531
memory
3632
cpuLoad

src/metrics.ts

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ export function formatMetricsForContainer(
3737
{
3838
metric: "cloudflare.containers.cpu",
3939
type: "gauge",
40-
points: [[ts, group.avg.cpuLoad]],
41-
tags: [...baseTags, "stat:avg"],
40+
points: [[ts, group.quantiles.cpuLoadP50]],
41+
tags: [...baseTags, "stat:p50"],
4242
},
4343
{
4444
metric: "cloudflare.containers.cpu",
@@ -65,8 +65,8 @@ export function formatMetricsForContainer(
6565
{
6666
metric: "cloudflare.containers.memory",
6767
type: "gauge",
68-
points: [[ts, group.avg.memory]],
69-
tags: [...baseTags, "stat:avg"],
68+
points: [[ts, group.quantiles.memoryP50]],
69+
tags: [...baseTags, "stat:p50"],
7070
},
7171
{
7272
metric: "cloudflare.containers.memory",
@@ -90,6 +90,12 @@ export function formatMetricsForContainer(
9090

9191
// Disk metrics
9292
metrics.push(
93+
{
94+
metric: "cloudflare.containers.disk",
95+
type: "gauge",
96+
points: [[ts, group.quantiles.diskUsageP50]],
97+
tags: [...baseTags, "stat:p50"],
98+
},
9399
{
94100
metric: "cloudflare.containers.disk",
95101
type: "gauge",

src/types.ts

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,6 @@ export const Container = z.looseObject({
2828

2929
// GraphQL Metrics Response Schemas
3030

31-
const MetricsAvgSchema = z.object({
32-
cpuLoad: z.number(),
33-
memory: z.number(),
34-
});
35-
3631
const MetricsMaxSchema = z.object({
3732
cpuLoad: z.number(),
3833
memory: z.number(),
@@ -66,7 +61,6 @@ const MetricsSumSchema = z.object({
6661
/** Metrics group from GraphQL API */
6762
export type MetricsGroup = z.infer<typeof MetricsGroup>;
6863
export const MetricsGroup = z.object({
69-
avg: MetricsAvgSchema,
7064
max: MetricsMaxSchema,
7165
dimensions: MetricsDimensionsSchema,
7266
quantiles: MetricsQuantilesSchema,

test/metricsformatting.test.ts

Lines changed: 16 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ describe("formatMetricsForContainer", () => {
2424
TEST_TIMESTAMP,
2525
);
2626

27-
// 4 CPU + 4 Memory + 3 Disk + 2 Bandwidth = 13 metrics per group
28-
expect(metrics).toHaveLength(13);
27+
// 4 CPU + 4 Memory + 4 Disk + 2 Bandwidth = 14 metrics per group
28+
expect(metrics).toHaveLength(14);
2929
});
3030

3131
it("formats multiple metrics groups", () => {
@@ -37,8 +37,8 @@ describe("formatMetricsForContainer", () => {
3737
TEST_TIMESTAMP,
3838
);
3939

40-
// 2 groups * 13 metrics = 26 metrics
41-
expect(metrics).toHaveLength(26);
40+
// 2 groups * 14 metrics = 28 metrics
41+
expect(metrics).toHaveLength(28);
4242
});
4343

4444
it("includes correct tags", () => {
@@ -61,7 +61,7 @@ describe("formatMetricsForContainer", () => {
6161

6262
const cpuMetric = metrics.find(
6363
(m) =>
64-
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:avg"),
64+
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
6565
);
6666

6767
expect(cpuMetric).toBeDefined();
@@ -99,8 +99,8 @@ describe("formatContainerMetrics", () => {
9999
TEST_TIMESTAMP,
100100
);
101101

102-
// 4 CPU + 4 Memory + 3 Disk + 2 Bandwidth = 13 metrics per group
103-
expect(metrics).toHaveLength(13);
102+
// 4 CPU + 4 Memory + 4 Disk + 2 Bandwidth = 14 metrics per group
103+
expect(metrics).toHaveLength(14);
104104
});
105105

106106
it("formats metrics for multiple containers with multiple groups", () => {
@@ -121,8 +121,8 @@ describe("formatContainerMetrics", () => {
121121
TEST_TIMESTAMP,
122122
);
123123

124-
// 3 groups * 13 metrics = 39 metrics
125-
expect(metrics).toHaveLength(39);
124+
// 3 groups * 14 metrics = 42 metrics
125+
expect(metrics).toHaveLength(42);
126126
});
127127

128128
it("includes correct tags for each metric", () => {
@@ -150,7 +150,7 @@ describe("formatContainerMetrics", () => {
150150

151151
const cpuMetric = metrics.find(
152152
(m) =>
153-
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:avg"),
153+
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
154154
);
155155

156156
expect(cpuMetric).toBeDefined();
@@ -161,20 +161,14 @@ describe("formatContainerMetrics", () => {
161161
);
162162
expect(cpuMetric?.tags).toContain("deployment_id:deploy-test");
163163
expect(cpuMetric?.tags).toContain("placement_id:place-test");
164-
expect(cpuMetric?.tags).toContain("stat:avg");
164+
expect(cpuMetric?.tags).toContain("stat:p50");
165165
});
166166

167167
it("uses correct metric values from the group", () => {
168168
const group = createMockMetricsGroup({
169-
avg: {
170-
cpuLoad: 0.42,
171-
memory: 123456789,
172-
rxBandwidthBps: 0,
173-
txBandwidthBps: 0,
174-
},
175169
max: { cpuLoad: 0.99, memory: 999999999, diskUsage: 5000000000 },
176170
quantiles: {
177-
cpuLoadP50: 0.3,
171+
cpuLoadP50: 0.42,
178172
cpuLoadP90: 0.8,
179173
cpuLoadP99: 0.95,
180174
memoryP50: 100000000,
@@ -200,12 +194,12 @@ describe("formatContainerMetrics", () => {
200194
TEST_TIMESTAMP,
201195
);
202196

203-
// Check CPU avg
204-
const cpuAvg = metrics.find(
197+
// Check CPU p50
198+
const cpuP50 = metrics.find(
205199
(m) =>
206-
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:avg"),
200+
m.metric === "cloudflare.containers.cpu" && m.tags.includes("stat:p50"),
207201
);
208-
expect(cpuAvg?.points[0]).toEqual([TEST_TIMESTAMP, 0.42]);
202+
expect(cpuP50?.points[0]).toEqual([TEST_TIMESTAMP, 0.42]);
209203

210204
// Check CPU max
211205
const cpuMax = metrics.find(

test/mocks/data.ts

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ export const mockContainers: Container[] = [
5151
*/
5252
export const mockMetricsGroups: MetricsGroup[] = [
5353
{
54-
avg: {
55-
cpuLoad: 0.25,
56-
memory: 268435456, // 256 MB
57-
},
5854
max: {
5955
cpuLoad: 0.75,
6056
memory: 402653184, // 384 MB
@@ -83,10 +79,6 @@ export const mockMetricsGroups: MetricsGroup[] = [
8379
},
8480
},
8581
{
86-
avg: {
87-
cpuLoad: 0.15,
88-
memory: 134217728, // 128 MB
89-
},
9082
max: {
9183
cpuLoad: 0.45,
9284
memory: 201326592, // 192 MB
@@ -149,11 +141,6 @@ export function createMockMetricsGroup(
149141
overrides: Partial<MetricsGroup> = {},
150142
): MetricsGroup {
151143
return {
152-
avg: {
153-
cpuLoad: 0.25,
154-
memory: 268435456,
155-
...overrides.avg,
156-
},
157144
max: {
158145
cpuLoad: 0.75,
159146
memory: 402653184,

0 commit comments

Comments
 (0)