mirror of
git://git.sv.gnu.org/coreutils
synced 2026-05-06 15:16:37 -04:00
cut: -f: fix handling of multi-byte delimiters that span buffers
* src/cut.c (cut_fields_bytesearch): Ensure that up to delim_length - 1 trailing bytes are left for the next refill.
* tests/cut/cut.pl: Add a test case.
This commit is contained in:
@@ -628,6 +628,25 @@ find_field_delim (char *buf, size_t len)
|
||||
#endif
|
||||
}
|
||||
|
||||
/* Return the number of trailing bytes in BUF that could be the initial
|
||||
bytes of a delimiter split across buffers. */
|
||||
|
||||
ATTRIBUTE_PURE
|
||||
static idx_t
|
||||
field_delim_overlap (char const *buf, idx_t len)
|
||||
{
|
||||
idx_t overlap = MIN (len, delim_length - 1);
|
||||
|
||||
while (0 < overlap)
|
||||
{
|
||||
if (memcmp (buf + len - overlap, delim_bytes, overlap) == 0)
|
||||
return overlap;
|
||||
overlap--;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Byte search for line end or delimiter in BUF,
|
||||
returning results in CTX. */
|
||||
|
||||
@@ -1142,6 +1161,12 @@ cut_fields_bytesearch (FILE *stream)
|
||||
idx_t field_len = terminator ? terminator - (chunk + processed)
|
||||
: n_avail - processed;
|
||||
|
||||
if (terminator_kind == FIELD_DATA
|
||||
&& !search.at_eof
|
||||
&& !whitespace_delimited
|
||||
&& !field_delim_is_line_delim ())
|
||||
field_len -= field_delim_overlap (chunk + processed, field_len);
|
||||
|
||||
if (field_len || terminator)
|
||||
have_pending_line = true;
|
||||
|
||||
|
||||
@@ -345,6 +345,9 @@ if ($mb_locale ne 'C')
|
||||
['mb-delim-8', '-d', "\xff", '-f2', # Note 0xF5-0xFF is efficient
|
||||
{IN=>"a\xffb\n"}, {OUT=>"b\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-delim-9', '-d', "\xc3\xa9", '-f2',
|
||||
{IN=>('a' x ($IO_BUFSIZE - 1)) . "\xc3\xa9b\n"}, {OUT=>"b\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-w-delim-1', '-w', '-f2', {IN=>"a\xe2\x80\x83b\n"}, {OUT=>"b\n"},
|
||||
{ENV => "LC_ALL=$mb_locale"}],
|
||||
['mb-w-delim-2', '-sw', '-f2', {IN=>"a\xc2\xa0b\n"}, {OUT=>""},
|
||||
|
||||
Reference in New Issue
Block a user