Skip to content

Commit 9fff567

Browse files
committed
tweak of thresholds, based on benchmarks
1 parent 77da497 commit 9fff567

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

src/utils/utf8.ts

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,15 +98,18 @@ const sharedTextEncoder = new TextEncoder();
9898

9999
// This threshold should be determined by benchmarking; the best value may vary across engines and input data.
100100
// Run `npx ts-node benchmark/encode-string.ts` for details.
101+
// For mixed content (ASCII + CJK + emoji), JS wins for strLength < 30-50.
102+
// After that, WASM or TextEncoder is faster depending on content type.
101103
const TEXT_ENCODER_THRESHOLD = 50;
102104

103105
/**
 * Encodes `str` into `output` using the module-level `sharedTextEncoder`
 * (a `TextEncoder`, so the bytes written are UTF-8).
 *
 * @param str - The string to encode.
 * @param output - Destination buffer; bytes are written in place.
 * @param outputOffset - Index in `output` at which writing starts.
 *
 * NOTE(review): the `{ read, written }` result of `encodeInto` is discarded, so
 * this function does not report how many bytes were written or whether `str`
 * fit; presumably callers size `output` beforehand — confirm against callers.
 */
export function utf8EncodeTE(str: string, output: Uint8Array, outputOffset: number): void {
104106
// `subarray` returns a view over the same memory, so the encoder writes straight into `output`.
sharedTextEncoder.encodeInto(str, output.subarray(outputOffset));
105107
}
106108

107-
// Wasm threshold: use wasm for medium strings, TextEncoder for large strings
108-
// These thresholds should be determined by benchmarking.
109-
// Run `npx ts-node benchmark/encode-string.ts` for details.
109+
// Wasm threshold: use wasm for medium strings, TextEncoder for large strings.
110+
// For pure ASCII, TextEncoder is ~1.7x faster at 100+ strLength.
111+
// For CJK/emoji, WASM is ~1.4-1.6x faster than TextEncoder at all sizes.
112+
// 1000 is a compromise for mixed content.
110113
const WASM_ENCODE_MAX = 1000;
111114

112115
function utf8EncodeWithWasm(str: string, output: Uint8Array, outputOffset: number): void {
@@ -187,14 +190,19 @@ const sharedTextDecoder = new TextDecoder();
187190

188191
// This threshold should be determined by benchmarking; the best value may vary across engines and input data.
189192
// Run `npx ts-node benchmark/decode-string.ts` for details.
190-
const TEXT_DECODER_THRESHOLD = 200;
193+
// For mixed content (ASCII + CJK + emoji), JS wins for very short strings only.
194+
// WASM becomes superior at ~30-50 bytes for non-ASCII content.
195+
const TEXT_DECODER_THRESHOLD = 50;
191196

192197
/**
 * Decodes `byteLength` bytes of `bytes`, starting at `inputOffset`, into a
 * string using the module-level `sharedTextDecoder` (a `TextDecoder` created
 * with no arguments, i.e. the default UTF-8 decoder).
 *
 * @param bytes - Source buffer holding the encoded text.
 * @param inputOffset - Index of the first byte to decode.
 * @param byteLength - Number of bytes to decode.
 * @returns The decoded string.
 */
export function utf8DecodeTD(bytes: Uint8Array, inputOffset: number, byteLength: number): string {
193198
// `subarray` creates a view over the requested range without copying the bytes.
const stringBytes = bytes.subarray(inputOffset, inputOffset + byteLength);
194199
return sharedTextDecoder.decode(stringBytes);
195200
}
196201

197-
// Wasm decode threshold: use wasm for medium strings, TextDecoder for large strings
202+
// Wasm decode threshold: use wasm for medium strings, TextDecoder for large strings.
203+
// For pure ASCII, TextDecoder is ~5x faster at 1000+ bytes.
204+
// For CJK/emoji, WASM is ~5-6x faster than TextDecoder at all sizes.
205+
// 1000 is a compromise for mixed content.
198206
const WASM_DECODE_MAX = 1000;
199207

200208
function utf8DecodeWithWasm(bytes: Uint8Array, inputOffset: number, byteLength: number): string {

0 commit comments

Comments
 (0)