cut: -f: fix handling of multi-byte delimiters that span buffers

* src/cut.c (cut_fields_bytesearch): Ensure that up to delim_length - 1
trailing bytes, which may begin a delimiter, are left for the next refill.
* tests/cut/cut.pl: Add a test case.
This commit is contained in:
Pádraig Brady
2026-04-02 21:56:23 +01:00
parent 57c87043f6
commit 1a44a25808
2 changed files with 28 additions and 0 deletions
+25
View File
@@ -628,6 +628,25 @@ find_field_delim (char *buf, size_t len)
#endif
}
/* Return the length of the longest suffix of the LEN bytes at BUF that
   matches a proper prefix of the field delimiter, or 0 if there is none.
   Such a suffix may be the start of a delimiter whose remaining bytes
   have not been read into the buffer yet.  */
ATTRIBUTE_PURE
static idx_t
field_delim_overlap (char const *buf, idx_t len)
{
  /* A split delimiter is missing at least its last byte, so no more than
     delim_length - 1 bytes of it can be present.  Try the longest
     candidate first so that the first match found is the longest one.  */
  for (idx_t n_tail = MIN (len, delim_length - 1); 0 < n_tail; n_tail--)
    {
      char const *tail = buf + len - n_tail;
      if (memcmp (tail, delim_bytes, n_tail) == 0)
        return n_tail;
    }

  return 0;
}
/* Byte search for line end or delimiter in BUF,
returning results in CTX. */
@@ -1142,6 +1161,12 @@ cut_fields_bytesearch (FILE *stream)
idx_t field_len = terminator ? terminator - (chunk + processed)
: n_avail - processed;
if (terminator_kind == FIELD_DATA
&& !search.at_eof
&& !whitespace_delimited
&& !field_delim_is_line_delim ())
field_len -= field_delim_overlap (chunk + processed, field_len);
if (field_len || terminator)
have_pending_line = true;
+3
View File
@@ -345,6 +345,9 @@ if ($mb_locale ne 'C')
['mb-delim-8', '-d', "\xff", '-f2', # Note 0xF5-0xFF is efficient
{IN=>"a\xffb\n"}, {OUT=>"b\n"},
{ENV => "LC_ALL=$mb_locale"}],
['mb-delim-9', '-d', "\xc3\xa9", '-f2',
{IN=>('a' x ($IO_BUFSIZE - 1)) . "\xc3\xa9b\n"}, {OUT=>"b\n"},
{ENV => "LC_ALL=$mb_locale"}],
['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
{ENV => "LC_ALL=$mb_locale"}],
['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},