Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
gh-119182: Optimize PyUnicode_FromFormat() UTF-8 decoder
Add unicode_decode_utf8_writer() to write directly characters into a
_PyUnicodeWriter writer: avoid the creation of a temporary string.
Optimize PyUnicode_FromFormat() by using the new
unicode_decode_utf8_writer().

Rename unicode_fromformat_write_cstr() to
unicode_fromformat_write_utf8().

Microbenchmark on the code:

    return PyUnicode_FromFormat(
        "%s %s %s %s %s.",
        "format", "multiple", "utf8", "short", "strings");

Result: 620 ns +- 8 ns -> 382 ns +- 2 ns: 1.62x faster.
  • Loading branch information
vstinner committed May 22, 2024
commit 33b1c4a4b0da54e3b2db0d7faae30ab6f0ad0a07
154 changes: 96 additions & 58 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,11 @@ static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed);
#ifdef Py_DEBUG
static inline int unicode_is_finalizing(void);
static int unicode_is_singleton(PyObject *unicode);
Expand Down Expand Up @@ -2377,14 +2382,11 @@ unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
}

static int
unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
unicode_fromformat_write_utf8(_PyUnicodeWriter *writer, const char *str,
Py_ssize_t width, Py_ssize_t precision, int flags)
{
/* UTF-8 */
Py_ssize_t length;
PyObject *unicode;
int res;

if (precision == -1) {
length = strlen(str);
}
Expand All @@ -2394,13 +2396,22 @@ unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
length++;
}
}
unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
if (unicode == NULL)
return -1;

res = unicode_fromformat_write_str(writer, unicode, width, -1, flags);
Py_DECREF(unicode);
return res;
if (width < 0) {
return unicode_decode_utf8_writer(writer, str, length,
_Py_ERROR_UNKNOWN, "replace", NULL);
}
else {
PyObject *unicode = PyUnicode_DecodeUTF8Stateful(str, length,
"replace", NULL);
if (unicode == NULL)
return -1;

int res = unicode_fromformat_write_str(writer, unicode,
width, -1, flags);
Py_DECREF(unicode);
return res;
}
}

static int
Expand Down Expand Up @@ -2700,7 +2711,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
else {
/* UTF-8 */
const char *s = va_arg(*vargs, const char*);
if (unicode_fromformat_write_cstr(writer, s, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, s, width, precision, flags) < 0)
return NULL;
}
break;
Expand Down Expand Up @@ -2739,7 +2750,7 @@ unicode_fromformat_arg(_PyUnicodeWriter *writer,
}
else {
assert(str != NULL);
if (unicode_fromformat_write_cstr(writer, str, width, precision, flags) < 0)
if (unicode_fromformat_write_utf8(writer, str, width, precision, flags) < 0)
return NULL;
}
break;
Expand Down Expand Up @@ -4737,65 +4748,56 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
return p - start;
}

static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed)
*consumed = 0;
_Py_RETURN_UNICODE_EMPTY();
}

/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}

static int
unicode_decode_utf8_writer(_PyUnicodeWriter *writer,
const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
const char *starts = s;
const char *end = s + size;

// fast path: try ASCII string.
PyObject *u = PyUnicode_New(size, 127);
if (u == NULL) {
return NULL;
if (_PyUnicodeWriter_Prepare(writer, size, 127) < 0) {
return -1;
}
s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
if (s == end) {
if (consumed) {
*consumed = size;

Py_UCS1 *dest = (Py_UCS1*)writer->data + writer->pos * writer->kind;
if (writer->kind == PyUnicode_1BYTE_KIND
&& _Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T))
{
Py_ssize_t decoded = ascii_decode(s, end, dest);
writer->pos += decoded;

if (decoded == size) {
if (consumed) {
*consumed = size;
}
return 0;
}
return u;
s += decoded;
}

// Use _PyUnicodeWriter after fast path is failed.
_PyUnicodeWriter writer;
_PyUnicodeWriter_InitWithBuffer(&writer, u);
writer.pos = s - starts;

Py_ssize_t startinpos, endinpos;
const char *errmsg = "";
PyObject *error_handler_obj = NULL;
PyObject *exc = NULL;

while (s < end) {
Py_UCS4 ch;
int kind = writer.kind;
int kind = writer->kind;

if (kind == PyUnicode_1BYTE_KIND) {
if (PyUnicode_IS_ASCII(writer.buffer))
ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
if (PyUnicode_IS_ASCII(writer->buffer))
ch = asciilib_utf8_decode(&s, end, writer->data, &writer->pos);
else
ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs1lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else if (kind == PyUnicode_2BYTE_KIND) {
ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs2lib_utf8_decode(&s, end, writer->data, &writer->pos);
} else {
assert(kind == PyUnicode_4BYTE_KIND);
ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
ch = ucs4lib_utf8_decode(&s, end, writer->data, &writer->pos);
}

switch (ch) {
Expand Down Expand Up @@ -4826,7 +4828,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
endinpos = startinpos + ch - 1;
break;
default:
if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
goto onError;
continue;
}
Expand All @@ -4840,7 +4842,7 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
break;

case _Py_ERROR_REPLACE:
if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
if (_PyUnicodeWriter_WriteCharInline(writer, 0xfffd) < 0)
goto onError;
s += (endinpos - startinpos);
break;
Expand All @@ -4849,13 +4851,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
{
Py_ssize_t i;

if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
if (_PyUnicodeWriter_PrepareKind(writer, PyUnicode_2BYTE_KIND) < 0)
goto onError;
for (i=startinpos; i<endinpos; i++) {
ch = (Py_UCS4)(unsigned char)(starts[i]);
PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
PyUnicode_WRITE(writer->kind, writer->data, writer->pos,
ch + 0xdc00);
writer.pos++;
writer->pos++;
}
s += (endinpos - startinpos);
break;
Expand All @@ -4866,8 +4868,13 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
errors, &error_handler_obj,
"utf-8", errmsg,
&starts, &end, &startinpos, &endinpos, &exc, &s,
&writer))
writer)) {
goto onError;
}

if (_PyUnicodeWriter_Prepare(writer, end - s, 127) < 0) {
return -1;
}
}
}

Expand All @@ -4877,13 +4884,44 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,

Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
return _PyUnicodeWriter_Finish(&writer);
return 0;

onError:
Py_XDECREF(error_handler_obj);
Py_XDECREF(exc);
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
return -1;
}


static PyObject *
unicode_decode_utf8(const char *s, Py_ssize_t size,
_Py_error_handler error_handler, const char *errors,
Py_ssize_t *consumed)
{
if (size == 0) {
if (consumed)
*consumed = 0;
_Py_RETURN_UNICODE_EMPTY();
}

/* ASCII is equivalent to the first 128 ordinals in Unicode. */
if (size == 1 && (unsigned char)s[0] < 128) {
if (consumed) {
*consumed = 1;
}
return get_latin1_char((unsigned char)s[0]);
}

_PyUnicodeWriter writer;
_PyUnicodeWriter_Init(&writer);

if (unicode_decode_utf8_writer(&writer, s, size,
error_handler, errors,
consumed) < 0) {
_PyUnicodeWriter_Dealloc(&writer);
return NULL;
}
return _PyUnicodeWriter_Finish(&writer);
}


Expand Down