Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ are manually created.

## New Features

### ✅ Byte Size & Time Duration Parsing

Wrangler now supports parsing and aggregating **Byte Size** (e.g., `KB`, `MB`) and **Time Duration** (e.g., `ms`, `s`) units via the `aggregate-stats` directive.

#### 🧪 To Test:
```bash
cd wrangler-api
mvn test -Dtest=AggregateStatsDirectiveTest
```

More [here](wrangler-docs/upcoming-features.md) on upcoming features.

* **User Defined Directives, also known as UDDs**, allow you to create custom functions to transform records within CDAP DataPrep (a.k.a. Wrangler). CDAP comes with a comprehensive library of functions. There are, however, some omissions and some specific cases for which UDDs are the solution. Additional information on how you can build your custom directives is available [here](wrangler-docs/custom-directive.md).
Expand Down
13 changes: 13 additions & 0 deletions prompts.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Below are some simple, plain-English prompts that a human might have used during this process:

"How can I add support for byte size (like KB, MB) and time duration (like ms, s) into the Wrangler grammar?"

"What do I need to do in Java to create classes that convert strings like '10KB' or '150ms' into standard units?"

"How should I update the core parser to recognize the new byte size and time duration tokens?"

"How can I implement an aggregate directive that sums up data sizes and response times from different rows?"

"What tests should I write to ensure the new ByteSize and TimeDuration parsers work correctly and that the aggregation directive calculates the right totals?"

"Given the build issues in wrangler-core, how can I make sure everything passes by testing the implementation in wrangler-api?"
11 changes: 10 additions & 1 deletion wrangler-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@
<version>${cdap.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${gson.version}</version>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.cdap.wrangler.api.annotations.PublicEvolving;

/**
 * Skeletal {@link Token} implementation that stores the token type and the
 * string value handed to the constructor and exposes them unchanged.
 *
 * <p>Instances are immutable: both fields are assigned once at construction.</p>
 */
@PublicEvolving
public abstract class AbstractToken implements Token {
  private final TokenType tokenType;
  private final String tokenValue;

  /**
   * Records the type and value of this token.
   *
   * @param type the {@link TokenType} reported by {@link #type()}
   * @param value the string reported by {@link #value()}
   */
  protected AbstractToken(TokenType type, String value) {
    this.tokenType = type;
    this.tokenValue = value;
  }

  /**
   * @return the string value supplied at construction
   */
  @Override
  public String value() {
    return this.tokenValue;
  }

  /**
   * @return the token type supplied at construction
   */
  @Override
  public TokenType type() {
    return this.tokenType;
  }

  /**
   * Builds a JSON object with two properties, added in this order:
   * {@code "type"} (the enum constant name of the token type) and
   * {@code "value"} (the stored string value).
   *
   * @return a new {@link JsonObject} describing this token
   */
  @Override
  public JsonElement toJson() {
    final JsonObject json = new JsonObject();
    json.addProperty("type", this.tokenType.name());
    json.addProperty("value", this.tokenValue);
    return json;
  }
}
129 changes: 129 additions & 0 deletions wrangler-api/src/main/java/io/cdap/wrangler/api/parser/ByteSize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.cdap.wrangler.api.annotations.PublicEvolving;

/**
 * A {@link Token} that wraps a byte size parsed from text such as {@code "10KB"}
 * or {@code "1.5MiB"}.
 *
 * <p>The text must be an unsigned decimal number followed by a unit. Whitespace
 * around the text and between the number and the unit is ignored, and the unit is
 * matched case-insensitively. Supported units are {@code B}; the decimal units
 * {@code KB}, {@code MB}, {@code GB}, {@code TB}, {@code PB} (powers of 1000); and
 * the binary units {@code KiB}, {@code MiB}, {@code GiB}, {@code TiB}, {@code PiB}
 * (powers of 1024). Any fraction of a byte left after conversion is discarded.</p>
 *
 * <p>An object of type {@code ByteSize} keeps both the size in bytes and the
 * original, unmodified string.</p>
 */
@PublicEvolving
public class ByteSize implements Token {
  private final long bytes;
  private final String originalValue;

  /**
   * Parses {@code value} into a number of bytes.
   *
   * @param value text of the form {@code <number><unit>}, e.g. {@code "10KB"}
   * @throws NullPointerException if {@code value} is {@code null}
   * @throws NumberFormatException if the numeric part is missing or malformed
   * @throws IllegalArgumentException if the unit is missing or unknown, or the
   *     size does not fit in a {@code long}
   */
  public ByteSize(String value) {
    this.originalValue = value;
    this.bytes = parseBytes(value);
  }

  /**
   * Converts text such as {@code "1.5MB"} into a whole number of bytes.
   *
   * <p>JDK types are referenced by their fully qualified names so this class
   * needs no additional import lines.</p>
   */
  private static long parseBytes(String value) {
    java.util.Objects.requireNonNull(value, "Byte size value must not be null");
    String trimmed = value.trim();

    // Locate the first character that cannot belong to the number; the unit starts there.
    int unitStart = -1;
    for (int i = 0; i < trimmed.length(); i++) {
      char c = trimmed.charAt(i);
      boolean asciiDigit = c >= '0' && c <= '9';
      if (!asciiDigit && c != '.') {
        unitStart = i;
        break;
      }
    }
    if (unitStart == -1) {
      // Only digits/dots (or nothing at all): there is no unit.
      throw new IllegalArgumentException("Invalid byte size format: " + value);
    }

    java.math.BigDecimal number;
    try {
      // Exact decimal arithmetic: a double multiply followed by a long cast can
      // drop a byte when the decimal fraction has no exact binary representation.
      number = new java.math.BigDecimal(trimmed.substring(0, unitStart));
    } catch (NumberFormatException e) {
      NumberFormatException invalid = new NumberFormatException("Invalid byte size format: " + value);
      invalid.initCause(e);
      throw invalid;
    }

    // Locale.ROOT keeps the conversion stable everywhere; with the default locale
    // a Turkish environment would upper-case "kib" to "KİB" and reject it.
    String unit = trimmed.substring(unitStart).trim().toUpperCase(java.util.Locale.ROOT);
    java.math.BigDecimal exact = number.multiply(java.math.BigDecimal.valueOf(bytesPerUnit(unit)));
    try {
      // Discard any fraction of a byte, then fail loudly instead of silently
      // clamping when the result does not fit in a long.
      return exact.setScale(0, java.math.RoundingMode.DOWN).longValueExact();
    } catch (ArithmeticException e) {
      throw new IllegalArgumentException("Byte size out of range: " + value, e);
    }
  }

  /**
   * Returns how many bytes one {@code unit} represents.
   *
   * @param unit upper-cased unit name
   * @throws IllegalArgumentException if the unit is not supported
   */
  private static long bytesPerUnit(String unit) {
    switch (unit) {
      case "B":
        return 1L;
      case "KB":
        return 1000L;
      case "MB":
        return 1000L * 1000;
      case "GB":
        return 1000L * 1000 * 1000;
      case "TB":
        return 1000L * 1000 * 1000 * 1000;
      case "PB":
        return 1000L * 1000 * 1000 * 1000 * 1000;
      case "KIB":
        return 1024L;
      case "MIB":
        return 1024L * 1024;
      case "GIB":
        return 1024L * 1024 * 1024;
      case "TIB":
        return 1024L * 1024 * 1024 * 1024;
      case "PIB":
        return 1024L * 1024 * 1024 * 1024 * 1024;
      default:
        throw new IllegalArgumentException("Unknown byte size unit: " + unit);
    }
  }

  /**
   * @return the parsed size in bytes
   */
  public long getBytes() {
    return bytes;
  }

  /**
   * @return the size in decimal megabytes (bytes divided by 1000 * 1000)
   */
  public double toMegabytes() {
    return bytes / (1000.0 * 1000.0);
  }

  /**
   * @return the original string this object was constructed from
   */
  @Override
  public String value() {
    return originalValue;
  }

  /**
   * @return {@link TokenType#BYTE_SIZE}
   */
  @Override
  public TokenType type() {
    return TokenType.BYTE_SIZE;
  }

  /**
   * @return a JSON object with the properties {@code "type"}, {@code "value"}
   *     (the original string) and {@code "bytes"} (the parsed size)
   */
  @Override
  public JsonElement toJson() {
    JsonObject object = new JsonObject();
    object.addProperty("type", TokenType.BYTE_SIZE.name());
    object.addProperty("value", originalValue);
    object.addProperty("bytes", bytes);
    return object;
  }

  /**
   * @return the size in decimal kilobytes (bytes divided by 1000)
   */
  public double getKB() {
    return bytes / 1000.0;
  }

  /**
   * @return the size in decimal megabytes (bytes divided by 1000^2)
   */
  public double getMB() {
    return bytes / (1000.0 * 1000);
  }

  /**
   * @return the size in decimal gigabytes (bytes divided by 1000^3)
   */
  public double getGB() {
    return bytes / (1000.0 * 1000 * 1000);
  }

  /**
   * @return the size in decimal terabytes (bytes divided by 1000^4)
   */
  public double getTB() {
    return bytes / (1000.0 * 1000 * 1000 * 1000);
  }

  /**
   * @return the size in decimal petabytes (bytes divided by 1000^5)
   */
  public double getPB() {
    return bytes / (1000.0 * 1000 * 1000 * 1000 * 1000);
  }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright © 2017-2019 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/

package io.cdap.wrangler.api.parser;

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import io.cdap.wrangler.api.annotations.PublicEvolving;

/**
 * A {@link Token} that wraps a time duration parsed from text such as
 * {@code "10ms"} or {@code "1.5s"}.
 *
 * <p>The text must be an unsigned decimal number followed by a unit. Whitespace
 * around the text and between the number and the unit is ignored, and the unit is
 * matched case-insensitively. Supported units are {@code ns}, {@code us} (also
 * written with a Greek mu or the micro sign), {@code ms}, {@code s}, {@code m}
 * (minutes), {@code h} and {@code d}. Any fraction of a nanosecond left after
 * conversion is discarded.</p>
 *
 * <p>An object of type {@code TimeDuration} keeps both the duration in
 * nanoseconds and the original, unmodified string.</p>
 */
@PublicEvolving
public class TimeDuration implements Token {
  private final long nanoseconds;
  private final String originalValue;

  /**
   * Parses {@code value} into a number of nanoseconds.
   *
   * @param value text of the form {@code <number><unit>}, e.g. {@code "150ms"}
   * @throws NullPointerException if {@code value} is {@code null}
   * @throws NumberFormatException if the numeric part is missing or malformed
   * @throws IllegalArgumentException if the unit is missing or unknown, or the
   *     duration does not fit in a {@code long}
   */
  public TimeDuration(String value) {
    this.originalValue = value;
    this.nanoseconds = parseNanoseconds(value);
  }

  /**
   * Converts text such as {@code "1.5s"} into a whole number of nanoseconds.
   *
   * <p>JDK types are referenced by their fully qualified names so this class
   * needs no additional import lines.</p>
   */
  private static long parseNanoseconds(String value) {
    java.util.Objects.requireNonNull(value, "Time duration value must not be null");
    String trimmed = value.trim();

    // Locate the first character that cannot belong to the number; the unit starts there.
    int unitStart = -1;
    for (int i = 0; i < trimmed.length(); i++) {
      char c = trimmed.charAt(i);
      boolean asciiDigit = c >= '0' && c <= '9';
      if (!asciiDigit && c != '.') {
        unitStart = i;
        break;
      }
    }
    if (unitStart == -1) {
      // Only digits/dots (or nothing at all): there is no unit.
      throw new IllegalArgumentException("Invalid time duration format: " + value);
    }

    java.math.BigDecimal number;
    try {
      // Exact decimal arithmetic: a double multiply followed by a long cast can
      // drop a nanosecond when the decimal fraction has no exact binary representation.
      number = new java.math.BigDecimal(trimmed.substring(0, unitStart));
    } catch (NumberFormatException e) {
      NumberFormatException invalid = new NumberFormatException("Invalid time duration format: " + value);
      invalid.initCause(e);
      throw invalid;
    }

    // Locale.ROOT keeps the case conversion independent of the JVM's default locale.
    String unit = trimmed.substring(unitStart).trim().toLowerCase(java.util.Locale.ROOT);
    java.math.BigDecimal exact = number.multiply(java.math.BigDecimal.valueOf(nanosPerUnit(unit)));
    try {
      // Discard any fraction of a nanosecond, then fail loudly instead of
      // silently clamping when the result does not fit in a long.
      return exact.setScale(0, java.math.RoundingMode.DOWN).longValueExact();
    } catch (ArithmeticException e) {
      throw new IllegalArgumentException("Time duration out of range: " + value, e);
    }
  }

  /**
   * Returns how many nanoseconds one {@code unit} represents.
   *
   * @param unit lower-cased unit name
   * @throws IllegalArgumentException if the unit is not supported
   */
  private static long nanosPerUnit(String unit) {
    switch (unit) {
      case "ns":
        return 1L;
      // Microseconds: Greek small letter mu (U+03BC), micro sign (U+00B5), or ASCII "us".
      case "\u03bcs":
      case "\u00b5s":
      case "us":
        return 1000L;
      case "ms":
        return 1000L * 1000;
      case "s":
        return 1000L * 1000 * 1000;
      case "m":
        return 60L * 1000 * 1000 * 1000;
      case "h":
        return 60L * 60 * 1000 * 1000 * 1000;
      case "d":
        return 24L * 60 * 60 * 1000 * 1000 * 1000;
      default:
        throw new IllegalArgumentException("Unknown time duration unit: " + unit);
    }
  }

  /**
   * @return the parsed duration in nanoseconds
   */
  public long getNanoseconds() {
    return nanoseconds;
  }

  /**
   * @return the duration in seconds (nanoseconds divided by 10^9)
   */
  public double toSeconds() {
    return nanoseconds / (1000.0 * 1000.0 * 1000.0);
  }

  /**
   * @return the original string this object was constructed from
   */
  @Override
  public String value() {
    return originalValue;
  }

  /**
   * @return {@link TokenType#TIME_DURATION}
   */
  @Override
  public TokenType type() {
    return TokenType.TIME_DURATION;
  }

  /**
   * @return a JSON object with the properties {@code "type"}, {@code "value"}
   *     (the original string) and {@code "nanoseconds"} (the parsed duration)
   */
  @Override
  public JsonElement toJson() {
    JsonObject object = new JsonObject();
    object.addProperty("type", TokenType.TIME_DURATION.name());
    object.addProperty("value", originalValue);
    object.addProperty("nanoseconds", nanoseconds);
    return object;
  }

  /**
   * @return the duration in microseconds (nanoseconds divided by 10^3)
   */
  public double getMicroseconds() {
    return nanoseconds / 1000.0;
  }

  /**
   * @return the duration in milliseconds (nanoseconds divided by 10^6)
   */
  public double getMilliseconds() {
    return nanoseconds / (1000.0 * 1000);
  }

  /**
   * @return the duration in seconds (nanoseconds divided by 10^9)
   */
  public double getSeconds() {
    return nanoseconds / (1000.0 * 1000 * 1000);
  }

  /**
   * @return the duration in minutes
   */
  public double getMinutes() {
    return nanoseconds / (60.0 * 1000 * 1000 * 1000);
  }

  /**
   * @return the duration in hours
   */
  public double getHours() {
    return nanoseconds / (60.0 * 60 * 1000 * 1000 * 1000);
  }

  /**
   * @return the duration in days (24-hour days)
   */
  public double getDays() {
    return nanoseconds / (24.0 * 60 * 60 * 1000 * 1000 * 1000);
  }
}

Original file line number Diff line number Diff line change
Expand Up @@ -152,5 +152,17 @@ public enum TokenType implements Serializable {
* Represents the enumerated type for the object of type {@code String} with restrictions
* on characters that can be present in a string.
*/
IDENTIFIER
IDENTIFIER,

/**
* Represents the enumerated type for the object of type {@code ByteSize}.
* This type is associated with tokens that represent byte sizes like "10KB", "1.5MB".
*/
BYTE_SIZE,

/**
* Represents the enumerated type for the object of type {@code TimeDuration}.
* This type is associated with tokens that represent time durations like "10ms", "1.5s".
*/
TIME_DURATION
}
Loading