From 098ed41716b679d80789b73e726f0f36d430e82e Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 9 May 2024 11:46:23 +0530 Subject: [PATCH] Add support for ANSI-C quoted strings to shlex --- kitty/fast_data_types.pyi | 2 +- kitty/shlex.c | 123 +++++++++++++++++++++++++++++++------- kitty/utils.py | 8 +-- kitty_tests/datatypes.py | 20 ++++++- 4 files changed, 127 insertions(+), 26 deletions(-) diff --git a/kitty/fast_data_types.pyi b/kitty/fast_data_types.pyi index 316e37d0d..68e990604 100644 --- a/kitty/fast_data_types.pyi +++ b/kitty/fast_data_types.pyi @@ -1527,7 +1527,7 @@ class AES256GCMDecrypt: class Shlex: - def __init__(self, src: str): ... + def __init__(self, src: str, allow_ansi_quoted_strings: bool = False): ... def next_word(self) -> Tuple[int, str]: ... diff --git a/kitty/shlex.c b/kitty/shlex.c index ef3086dbb..22e9764ad 100644 --- a/kitty/shlex.c +++ b/kitty/shlex.c @@ -7,13 +7,13 @@ #include "data-types.h" -typedef enum { NORMAL, WORD, STRING_WITHOUT_ESCAPES, STRING_WITH_ESCAPES, } State; +typedef enum { NORMAL, WORD, STRING_WITHOUT_ESCAPES, STRING_WITH_ESCAPES, ANSI_C_QUOTED } State; typedef struct { PyObject_HEAD PyObject *src, *buf; Py_ssize_t src_sz, src_pos, word_start, buf_pos; - int kind; void *src_data, *buf_data; + int kind, support_ansi_c_quoting, output_kind; void *src_data, *buf_data; State state; } Shlex; @@ -24,15 +24,17 @@ new_shlex_object(PyTypeObject *type, PyObject *args, PyObject UNUSED *kwds) { self = (Shlex *)type->tp_alloc(type, 0); if (self) { PyObject *src; - if (!PyArg_ParseTuple(args, "U", &src)) return NULL; + self->support_ansi_c_quoting = 0; + if (!PyArg_ParseTuple(args, "U|p", &src, &self->support_ansi_c_quoting)) return NULL; self->src_sz = PyUnicode_GET_LENGTH(src); - self->buf = PyUnicode_New(self->src_sz, PyUnicode_MAX_CHAR_VALUE(src)); + self->buf = PyUnicode_New(self->src_sz, self->support_ansi_c_quoting ? 1114111 : PyUnicode_MAX_CHAR_VALUE(src)); if (self->buf) { self->src = src; Py_INCREF(src); self->kind = PyUnicode_KIND(src); self->src_data = PyUnicode_DATA(src); self->buf_data = PyUnicode_DATA(self->buf); + self->output_kind = PyUnicode_KIND(self->buf); } else Py_CLEAR(self); } return (PyObject*) self; @@ -57,7 +59,7 @@ start_word(Shlex *self) { static void write_ch(Shlex *self, Py_UCS4 ch) { - PyUnicode_WRITE(self->kind, self->buf_data, self->buf_pos, ch); self->buf_pos++; + PyUnicode_WRITE(self->output_kind, self->buf_data, self->buf_pos, ch); self->buf_pos++; } static PyObject* @@ -66,16 +68,93 @@ get_word(Shlex *self) { return Py_BuildValue("nN", self->word_start, PyUnicode_Substring(self->buf, 0, pos)); } +static Py_UCS4 +read_ch(Shlex *self) { + Py_UCS4 nch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); self->src_pos++; + return nch; +} + static bool write_escape_ch(Shlex *self) { if (self->src_pos < self->src_sz) { - Py_UCS4 nch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); self->src_pos++; + Py_UCS4 nch = read_ch(self); write_ch(self, nch); return true; } return false; } +static bool +write_control_ch(Shlex *self) { + if (self->src_pos >= self->src_sz) { PyErr_SetString(PyExc_ValueError, "Trailing \\c escape at end of input data"); return false; } + Py_UCS4 ch = read_ch(self); + write_ch(self, ch & 31); + return true; +} + +static void +read_valid_digits(Shlex *self, int max, char *output, bool(*is_valid)(Py_UCS4 ch)) { + for (int i = 0; i < max && self->src_pos < self->src_sz; i++) { + Py_UCS4 ch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); + if (!is_valid(ch)) break; + output[0] = ch; + self->src_pos++; output++; + } +} + +static bool +is_octal_digit(Py_UCS4 ch) { return '0' <= ch && ch <= '7'; } + +static bool +is_hex_digit(Py_UCS4 ch) { return ('0' <= ch && ch <= '9') || ('a' <= ch && ch <= 'f') || ('A' <= ch && ch <= 'F'); } + +static void +write_octal_ch(Shlex *self, Py_UCS4 ch) { + char chars[4] = {ch, 0, 0, 0}; + read_valid_digits(self, 2, chars + 1, is_octal_digit); + write_ch(self, strtol(chars, NULL, 8)); +} + +static bool +write_unicode_ch(Shlex *self, int max) { + char chars[16] = {0}; + read_valid_digits(self, max, chars, is_hex_digit); + if (!chars[0]) { PyErr_SetString(PyExc_ValueError, "Trailing unicode escape at end of input data"); return false; } + write_ch(self, strtol(chars, NULL, 16)); + return true; +} + +static bool +write_ansi_escape_ch(Shlex *self) { + if (self->src_pos >= self->src_sz) { PyErr_SetString(PyExc_ValueError, "Trailing backslash at end of input data"); return false; } + Py_UCS4 ch = read_ch(self); + switch(ch) { + case 'a': write_ch(self, '\a'); return true; + case 'b': write_ch(self, '\b'); return true; + case 'e': case 'E': write_ch(self, 0x1b); return true; + case 'f': write_ch(self, '\f'); return true; + case 'n': write_ch(self, '\n'); return true; + case 'r': write_ch(self, '\r'); return true; + case 't': write_ch(self, '\t'); return true; + case 'v': write_ch(self, '\v'); return true; + case '\\': write_ch(self, '\\'); return true; + case '\'': write_ch(self, '\''); return true; + case '\"': write_ch(self, '\"'); return true; + case '\?': write_ch(self, '\?'); return true; + + case 'c': return write_control_ch(self); + case 'x': return write_unicode_ch(self, 2); + case 'u': return write_unicode_ch(self, 4); + case 'U': return write_unicode_ch(self, 8); +START_ALLOW_CASE_RANGE + case '0' ... '7': write_octal_ch(self, ch); return true; +END_ALLOW_CASE_RANGE + + default: + write_ch(self, ch); return true; + } +} + static void set_state(Shlex *self, State s) { self->state = s; @@ -85,8 +164,9 @@ static PyObject* next_word(Shlex *self, PyObject *args UNUSED) { #define write_escaped_or_fail() if (!write_escape_ch(self)) { PyErr_SetString(PyExc_ValueError, "Trailing backslash at end of input data"); return NULL; } + Py_UCS4 prev_word_ch = 0; while (self->src_pos < self->src_sz) { - Py_UCS4 ch = PyUnicode_READ(self->kind, self->src_data, self->src_pos); self->src_pos++; + Py_UCS4 ch = read_ch(self); switch(self->state) { case NORMAL: switch(ch) { @@ -94,32 +174,35 @@ next_word(Shlex *self, PyObject *args UNUSED) { case STRING_WITH_ESCAPES_DELIM: set_state(self, STRING_WITH_ESCAPES); start_word(self); break; case STRING_WITHOUT_ESCAPES_DELIM: set_state(self, STRING_WITHOUT_ESCAPES); start_word(self); break; case ESCAPE_CHAR: start_word(self); write_escaped_or_fail(); set_state(self, WORD); break; - default: set_state(self, WORD); start_word(self); write_ch(self, ch); break; + default: set_state(self, WORD); start_word(self); write_ch(self, ch); prev_word_ch = ch; break; } break; case WORD: switch(ch) { case WHITESPACE: set_state(self, NORMAL); if (self->buf_pos) return get_word(self); break; case STRING_WITH_ESCAPES_DELIM: set_state(self, STRING_WITH_ESCAPES); break; - case STRING_WITHOUT_ESCAPES_DELIM: set_state(self, STRING_WITHOUT_ESCAPES); break; + case STRING_WITHOUT_ESCAPES_DELIM: + if (self->support_ansi_c_quoting && prev_word_ch == '$') { self->buf_pos--; set_state(self, ANSI_C_QUOTED); } + else set_state(self, STRING_WITHOUT_ESCAPES); + break; case ESCAPE_CHAR: write_escaped_or_fail(); break; - default: write_ch(self, ch); break; + default: write_ch(self, ch); prev_word_ch = ch; break; } break; case STRING_WITHOUT_ESCAPES: switch(ch) { - case STRING_WITHOUT_ESCAPES_DELIM: - set_state(self, WORD); - break; + case STRING_WITHOUT_ESCAPES_DELIM: set_state(self, WORD); break; default: write_ch(self, ch); break; } break; case STRING_WITH_ESCAPES: switch(ch) { - case STRING_WITH_ESCAPES_DELIM: - set_state(self, WORD); - break; - case ESCAPE_CHAR: - write_escape_ch(self); - break; + case STRING_WITH_ESCAPES_DELIM: set_state(self, WORD); break; + case ESCAPE_CHAR: write_escaped_or_fail(); break; + default: write_ch(self, ch); break; + } break; + case ANSI_C_QUOTED: + switch(ch) { + case STRING_WITHOUT_ESCAPES_DELIM: set_state(self, WORD); break; + case ESCAPE_CHAR: if (!write_ansi_escape_ch(self)) return NULL; break; default: write_ch(self, ch); break; } break; } @@ -129,7 +212,7 @@ next_word(Shlex *self, PyObject *args UNUSED) { self->state = NORMAL; if (self->buf_pos) return get_word(self); break; - case STRING_WITH_ESCAPES: case STRING_WITHOUT_ESCAPES: + case STRING_WITH_ESCAPES: case STRING_WITHOUT_ESCAPES: case ANSI_C_QUOTED: PyErr_SetString(PyExc_ValueError, "Unterminated string at the end of input"); self->state = NORMAL; return NULL; diff --git a/kitty/utils.py b/kitty/utils.py index 13169bd93..e5c03e23e 100644 --- a/kitty/utils.py +++ b/kitty/utils.py @@ -1227,14 +1227,14 @@ def key_val_matcher(items: Iterable[Tuple[str, str]], key_pat: 're.Pattern[str]' return False -def shlex_split(text: str) -> Iterator[str]: - s = Shlex(text) +def shlex_split(text: str, allow_ansi_quoted_strings: bool = False) -> Iterator[str]: + s = Shlex(text, allow_ansi_quoted_strings) while (q := s.next_word())[0] > -1: yield q[1] -def shlex_split_with_positions(text: str) -> Iterator[Tuple[int, str]]: - s = Shlex(text) +def shlex_split_with_positions(text: str, allow_ansi_quoted_strings: bool = False) -> Iterator[Tuple[int, str]]: + s = Shlex(text, allow_ansi_quoted_strings) while (q := s.next_word())[0] > -1: yield q diff --git a/kitty_tests/datatypes.py b/kitty_tests/datatypes.py index b20b6452f..dee3e645c 100644 --- a/kitty_tests/datatypes.py +++ b/kitty_tests/datatypes.py @@ -629,7 +629,7 @@ class TestDataTypes(BaseTest): def test_shlex_split(self): for bad in ( - 'abc\\', '\\', "'abc", "'", '"', 'asd' + '\\', + 'abc\\', '\\', "'abc", "'", '"', 'asd' + '\\', r'"a\"', '"a\\', ): with self.assertRaises(ValueError, msg=f'Failed to raise exception for {bad!r}'): tuple(shlex_split_with_positions(bad)) @@ -640,6 +640,24 @@ class TestDataTypes(BaseTest): r'''x'y"\z'1''': ((0, 'xy"\\z1'),), r'\abc\ d': ((0, 'abc d'),), '': (), ' ': (), ' \tabc\n\t\r ': ((2, 'abc'),), + "$'ab'": ((0, '$ab'),), }.items(): actual = tuple(shlex_split_with_positions(q)) self.ae(expected, actual, f'Failed for text: {q!r}') + + for q, expected in { + "$'ab'": ((0, 'ab'),), + "1$'ab'": ((0, '1ab'),), + '''"1$'ab'"''': ((0, "1$'ab'"),), + r"$'a\123b'": ((0, 'a\123b'),), + r"$'a\1b'": ((0, 'a\001b'),), + r"$'a\12b'": ((0, 'a\012b'),), + r"$'a\db'": ((0, 'adb'),), + r"$'a\x1bb'": ((0, 'a\x1bb'),), + r"$'\u123z'": ((0, '\u0123z'),), + r"$'\U0001F1E8'": ((0, '\U0001F1E8'),), + r"$'\U1F1E8'": ((0, '\U0001F1E8'),), + r"$'a\U1F1E8'b": ((0, 'a\U0001F1E8b'),), + }.items(): + actual = tuple(shlex_split_with_positions(q, True)) + self.ae(expected, actual, f'Failed for text: {q!r}')