Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 69 additions & 25 deletions src/rust/encoding/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,13 @@
//! Exposes a streaming decoder to C++ via CXX bridge. All legacy encodings
//! (CJK multi-byte, single-byte windows-1252, and x-user-defined) are handled
//! by a single opaque `Decoder` type backed by `encoding_rs::Decoder`.
//!
//! The output buffer is owned by the `Decoder` and reused across calls to
//! avoid repeated heap allocations. C++ reads the decoded UTF-16 data via
//! the pointer and length returned in `DecodeResult`.

#[cxx::bridge(namespace = "workerd::rust::encoding")]
#[expect(clippy::elidable_lifetime_names)] // CXX bridge requires named lifetimes on shared structs
mod ffi {
/// Legacy encoding types supported by the Rust decoder.
/// Shared between C++ and Rust.
Expand All @@ -26,10 +31,13 @@ mod ffi {
XUserDefined,
}

/// Result of a decode operation.
struct DecodeResult {
/// UTF-16 output.
output: Vec<u16>,
/// Result of a decode operation. The output slice borrows the
/// decoder's internal buffer and is valid until the next `decode` or
/// `reset` call.
struct DecodeResult<'a> {
/// UTF-16 code units decoded from the input, borrowing the
/// decoder's reusable output buffer.
output: &'a [u16],
/// True if a fatal decoding error was encountered. Only meaningful
/// when the caller requested fatal mode — in replacement mode errors
/// are silently replaced with U+FFFD and this flag is not set.
Expand All @@ -49,12 +57,18 @@ mod ffi {
#[expect(clippy::unnecessary_box_returns)]
fn new_decoder(encoding: Encoding) -> Box<Decoder>;

/// Decode a chunk of bytes. Set `flush` to true on the final chunk.
/// When `fatal` is true and an error is encountered, `had_error` is
/// set and the output may be incomplete.
fn decode(decoder: &mut Decoder, input: &[u8], options: &DecodeOptions) -> DecodeResult;

/// Reset the decoder to its initial state.
/// Decode a chunk of bytes. The decoded UTF-16 output is stored in
/// the decoder's internal buffer; the returned `DecodeResult`
/// borrows that buffer. Set `flush` to true on the final chunk.
/// When `fatal` is true and an error is encountered, `had_error`
/// is set and the output may be incomplete.
unsafe fn decode<'a>(
decoder: &'a mut Decoder,
input: &[u8],
options: &DecodeOptions,
) -> DecodeResult<'a>;

/// Reset the decoder to its initial state (for explicit reset calls).
fn reset(decoder: &mut Decoder);
}
}
Expand All @@ -63,6 +77,11 @@ mod ffi {
pub struct Decoder {
encoding: &'static encoding_rs::Encoding,
inner: encoding_rs::Decoder,
/// Reusable output buffer — kept across calls to avoid allocation.
output: Vec<u16>,
/// Set after a flush decode; checked at the start of the next decode
/// to lazily reconstruct the inner decoder.
needs_reset: bool,
}

/// Map a CXX-shared `Encoding` variant to the corresponding
Expand All @@ -87,30 +106,40 @@ pub fn new_decoder(encoding: ffi::Encoding) -> Box<Decoder> {
Box::new(Decoder {
inner: encoding.new_decoder_without_bom_handling(),
encoding,
output: Vec::new(),
needs_reset: false,
})
}

pub fn decode(
state: &mut Decoder,
pub fn decode<'a>(
state: &'a mut Decoder,
input: &[u8],
options: &ffi::DecodeOptions,
) -> ffi::DecodeResult {
// max_utf16_buffer_length() returns None on usize overflow. The +4 covers extra
// UTF-16 code units from decoder state. Safe even if slightly short since the decode loop
// below resizes on OutputFull.
) -> ffi::DecodeResult<'a> {
// Lazy reset: reconstruct the inner decoder only when a previous flush
// marked it as needed, avoiding the cost on one-shot decodes where the
// decoder is never reused.
if state.needs_reset {
state.inner = state.encoding.new_decoder_without_bom_handling();
state.needs_reset = false;
}

// Reuse the output buffer — clear length but keep the allocation.
state.output.clear();
let max_len = state
.inner
.max_utf16_buffer_length(input.len())
.unwrap_or(input.len() + 4);
let mut output = vec![0u16; max_len];
state.output.resize(max_len, 0);

let mut total_read = 0usize;
let mut total_written = 0usize;

if options.fatal {
loop {
let (result, read, written) = state.inner.decode_to_utf16_without_replacement(
&input[total_read..],
&mut output[total_written..],
&mut state.output[total_written..],
options.flush,
);
total_read += read;
Expand All @@ -119,13 +148,15 @@ pub fn decode(
match result {
encoding_rs::DecoderResult::InputEmpty => break,
encoding_rs::DecoderResult::OutputFull => {
output.resize(output.len() * 2, 0);
state.output.resize(state.output.len() * 2, 0);
}
encoding_rs::DecoderResult::Malformed(_, _) => {
// Reset immediately on fatal error so the decoder is
// ready for a fresh sequence if reused.
state.inner = state.encoding.new_decoder_without_bom_handling();
output.truncate(total_written);
state.output.truncate(total_written);
return ffi::DecodeResult {
output,
output: &state.output,
had_error: true,
};
}
Expand All @@ -135,7 +166,7 @@ pub fn decode(
loop {
let (result, read, written, _had_errors) = state.inner.decode_to_utf16(
&input[total_read..],
&mut output[total_written..],
&mut state.output[total_written..],
options.flush,
);
total_read += read;
Expand All @@ -144,19 +175,32 @@ pub fn decode(
match result {
encoding_rs::CoderResult::InputEmpty => break,
encoding_rs::CoderResult::OutputFull => {
output.resize(output.len() * 2, 0);
state.output.resize(state.output.len() * 2, 0);
}
}
}
}

output.truncate(total_written);
state.output.truncate(total_written);

if options.flush {
// Defer the actual reset to the next decode() call.
state.needs_reset = true;
}

ffi::DecodeResult {
output,
output: &state.output,
had_error: false,
}
}

pub fn reset(state: &mut Decoder) {
state.inner = state.encoding.new_decoder_without_bom_handling();
state.needs_reset = false;
// Intentionally keep state.output — preserves the allocation for reuse.
// The buffer can grow up to ~2× the largest input chunk (due to UTF-16
// expansion and the doubling strategy in decode()) and stays at that high-
// water mark. This is acceptable because the Decoder is owned by a JS
// TextDecoder object and is GC'd with it, so the buffer lifetime is
// bounded by the object's reachability.
}
13 changes: 4 additions & 9 deletions src/workerd/api/encoding-legacy.c++
Original file line number Diff line number Diff line change
Expand Up @@ -58,24 +58,19 @@ kj::Maybe<jsg::JsString> LegacyDecoder::decode(
// https://github.com/hsivonen/encoding_rs/issues/126#issuecomment-3677642122
return js.str();
}
// Reset decoder state after flush, matching IcuDecoder's KJ_DEFER contract.
// This ensures decodePtr() (used by TextDecoderStream) resets correctly on flush.
KJ_DEFER({
if (flush) reset();
});

::workerd::rust::encoding::DecodeOptions options{.flush = flush, .fatal = fatal.toBool()};
// kj_rs::RustMutable is used to avoid a copy of the underlying buffer.
// Decode into the Rust-side reusable buffer. The Rust decoder handles
// lazy reset internally when a previous call used flush=true.
auto result =
::workerd::rust::encoding::decode(*state, buffer.as<kj_rs::RustMutable>(), kj::mv(options));

if (fatal.toBool() && result.had_error) {
// Decoder state already reset by the Rust side on fatal error.
return kj::none;
}

auto output = kj::from<kj_rs::Rust>(result.output);
return js.str(output);
// Zero-copy view of the UTF-16 output slice from the Rust-owned buffer.
return js.str(kj::from<kj_rs::Rust>(result.output));
}

} // namespace workerd::api
Loading