mirror of
https://github.com/uutils/coreutils.git
synced 2026-05-06 07:26:38 -04:00
Merge pull request #9126 from mattsu2020/fold_fix
fix(fold): GNU fold-characters.sh test
This commit is contained in:
@@ -120,6 +120,7 @@ pseudoprimes
|
||||
quantiles
|
||||
readonly
|
||||
reparse
|
||||
rposition
|
||||
seedable
|
||||
semver
|
||||
semiprime
|
||||
|
||||
Generated
+1
@@ -3495,6 +3495,7 @@ dependencies = [
|
||||
"codspeed-divan-compat",
|
||||
"fluent",
|
||||
"tempfile",
|
||||
"unicode-width 0.2.2",
|
||||
"uucore",
|
||||
]
|
||||
|
||||
|
||||
@@ -21,6 +21,7 @@ path = "src/fold.rs"
|
||||
clap = { workspace = true }
|
||||
uucore = { workspace = true }
|
||||
fluent = { workspace = true }
|
||||
unicode-width = { workspace = true }
|
||||
|
||||
[dev-dependencies]
|
||||
divan = { workspace = true }
|
||||
|
||||
@@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given)
|
||||
to standard output whilst breaking long lines
|
||||
fold-usage = fold [OPTION]... [FILE]...
|
||||
fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially)
|
||||
fold-characters-help = count using character positions rather than display columns
|
||||
fold-spaces-help = break lines at word boundaries rather than a hard cut-off
|
||||
fold-width-help = set WIDTH as the maximum line width rather than 80
|
||||
fold-error-illegal-width = illegal width value
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues
|
||||
fold-usage = fold [OPTION]... [FICHIER]...
|
||||
fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement)
|
||||
fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage
|
||||
fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe
|
||||
fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80
|
||||
fold-error-illegal-width = valeur de largeur illégale
|
||||
|
||||
+355
-67
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
|
||||
use std::path::Path;
|
||||
use unicode_width::UnicodeWidthChar;
|
||||
use uucore::display::Quotable;
|
||||
use uucore::error::{FromIo, UResult, USimpleError};
|
||||
use uucore::format_usage;
|
||||
@@ -21,11 +22,28 @@ const TAB: u8 = b'\t';
|
||||
|
||||
/// Identifiers of the command-line arguments registered with clap.
mod options {
    /// Count using bytes rather than columns.
    pub const BYTES: &str = "bytes";

    /// Count using character positions rather than display columns.
    pub const CHARACTERS: &str = "characters";

    /// Break lines at word boundaries rather than at a hard cut-off.
    pub const SPACES: &str = "spaces";

    /// Maximum line width.
    pub const WIDTH: &str = "width";

    /// Input operands (presumably the positional FILE arguments; the
    /// argument definition is outside this view).
    pub const FILE: &str = "file";
}
|
||||
|
||||
/// How the width of a line is measured when folding (bytes mode is handled
/// separately and does not use this type).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WidthMode {
    /// Measure in display columns: wide characters may occupy more than one
    /// column and tabs advance to the next tab stop.
    Columns,
    /// Measure in character positions: each character counts as one.
    Characters,
}
|
||||
|
||||
struct FoldContext<'a, W: Write> {
|
||||
spaces: bool,
|
||||
width: usize,
|
||||
mode: WidthMode,
|
||||
writer: &'a mut W,
|
||||
output: &'a mut Vec<u8>,
|
||||
col_count: &'a mut usize,
|
||||
last_space: &'a mut Option<usize>,
|
||||
}
|
||||
|
||||
#[uucore::main]
|
||||
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||
let args = args.collect_lossy();
|
||||
@@ -34,6 +52,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
|
||||
|
||||
let bytes = matches.get_flag(options::BYTES);
|
||||
let characters = matches.get_flag(options::CHARACTERS);
|
||||
let spaces = matches.get_flag(options::SPACES);
|
||||
let poss_width = match matches.get_one::<String>(options::WIDTH) {
|
||||
Some(v) => Some(v.clone()),
|
||||
@@ -55,7 +74,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
|
||||
None => vec!["-".to_owned()],
|
||||
};
|
||||
|
||||
fold(&files, bytes, spaces, width)
|
||||
fold(&files, bytes, characters, spaces, width)
|
||||
}
|
||||
|
||||
pub fn uu_app() -> Command {
|
||||
@@ -72,6 +91,13 @@ pub fn uu_app() -> Command {
|
||||
.help(translate!("fold-bytes-help"))
|
||||
.action(ArgAction::SetTrue),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(options::CHARACTERS)
|
||||
.long(options::CHARACTERS)
|
||||
.help(translate!("fold-characters-help"))
|
||||
.conflicts_with(options::BYTES)
|
||||
.action(ArgAction::SetTrue),
|
||||
)
|
||||
.arg(
|
||||
Arg::new(options::SPACES)
|
||||
.long(options::SPACES)
|
||||
@@ -107,7 +133,13 @@ fn handle_obsolete(args: &[String]) -> (Vec<String>, Option<String>) {
|
||||
(args.to_vec(), None)
|
||||
}
|
||||
|
||||
fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> {
|
||||
fn fold(
|
||||
filenames: &[String],
|
||||
bytes: bool,
|
||||
characters: bool,
|
||||
spaces: bool,
|
||||
width: usize,
|
||||
) -> UResult<()> {
|
||||
let mut output = BufWriter::new(stdout());
|
||||
|
||||
for filename in filenames {
|
||||
@@ -125,7 +157,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul
|
||||
if bytes {
|
||||
fold_file_bytewise(buffer, spaces, width, &mut output)?;
|
||||
} else {
|
||||
fold_file(buffer, spaces, width, &mut output)?;
|
||||
let mode = if characters {
|
||||
WidthMode::Characters
|
||||
} else {
|
||||
WidthMode::Columns
|
||||
};
|
||||
fold_file(buffer, spaces, width, mode, &mut output)?;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -213,6 +250,303 @@ fn fold_file_bytewise<T: Read, W: Write>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn next_tab_stop(col_count: usize) -> usize {
|
||||
col_count + TAB_WIDTH - col_count % TAB_WIDTH
|
||||
}
|
||||
|
||||
fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
|
||||
match mode {
|
||||
WidthMode::Characters => std::str::from_utf8(buffer)
|
||||
.map(|s| s.chars().count())
|
||||
.unwrap_or(buffer.len()),
|
||||
WidthMode::Columns => {
|
||||
if let Ok(s) = std::str::from_utf8(buffer) {
|
||||
let mut width = 0;
|
||||
for ch in s.chars() {
|
||||
match ch {
|
||||
'\r' => width = 0,
|
||||
'\t' => width = next_tab_stop(width),
|
||||
'\x08' => width = width.saturating_sub(1),
|
||||
_ => width += UnicodeWidthChar::width(ch).unwrap_or(0),
|
||||
}
|
||||
}
|
||||
width
|
||||
} else {
|
||||
let mut width = 0;
|
||||
for &byte in buffer {
|
||||
match byte {
|
||||
CR => width = 0,
|
||||
TAB => width = next_tab_stop(width),
|
||||
0x08 => width = width.saturating_sub(1),
|
||||
_ => width += 1,
|
||||
}
|
||||
}
|
||||
width
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
|
||||
let consume = match *ctx.last_space {
|
||||
Some(index) => index + 1,
|
||||
None => ctx.output.len(),
|
||||
};
|
||||
|
||||
if consume > 0 {
|
||||
ctx.writer.write_all(&ctx.output[..consume])?;
|
||||
}
|
||||
ctx.writer.write_all(&[NL])?;
|
||||
|
||||
let last_space = *ctx.last_space;
|
||||
|
||||
if consume < ctx.output.len() {
|
||||
ctx.output.drain(..consume);
|
||||
} else {
|
||||
ctx.output.clear();
|
||||
}
|
||||
|
||||
*ctx.col_count = compute_col_count(ctx.output, ctx.mode);
|
||||
|
||||
if ctx.spaces {
|
||||
*ctx.last_space = last_space.and_then(|idx| {
|
||||
if idx < consume {
|
||||
None
|
||||
} else {
|
||||
Some(idx - consume)
|
||||
}
|
||||
});
|
||||
} else {
|
||||
*ctx.last_space = None;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
|
||||
let mut idx = 0;
|
||||
let len = line.len();
|
||||
|
||||
while idx < len {
|
||||
match line[idx] {
|
||||
NL => {
|
||||
*ctx.last_space = None;
|
||||
emit_output(ctx)?;
|
||||
break;
|
||||
}
|
||||
CR => {
|
||||
ctx.output.push(CR);
|
||||
*ctx.col_count = 0;
|
||||
idx += 1;
|
||||
}
|
||||
0x08 => {
|
||||
ctx.output.push(0x08);
|
||||
*ctx.col_count = ctx.col_count.saturating_sub(1);
|
||||
idx += 1;
|
||||
}
|
||||
TAB if ctx.mode == WidthMode::Columns => {
|
||||
loop {
|
||||
let next_stop = next_tab_stop(*ctx.col_count);
|
||||
if next_stop > ctx.width && !ctx.output.is_empty() {
|
||||
emit_output(ctx)?;
|
||||
continue;
|
||||
}
|
||||
*ctx.col_count = next_stop;
|
||||
break;
|
||||
}
|
||||
if ctx.spaces {
|
||||
*ctx.last_space = Some(ctx.output.len());
|
||||
} else {
|
||||
*ctx.last_space = None;
|
||||
}
|
||||
ctx.output.push(TAB);
|
||||
idx += 1;
|
||||
}
|
||||
0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => {
|
||||
ctx.output.push(line[idx]);
|
||||
if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR {
|
||||
*ctx.last_space = Some(ctx.output.len() - 1);
|
||||
} else if !ctx.spaces {
|
||||
*ctx.last_space = None;
|
||||
}
|
||||
idx += 1;
|
||||
}
|
||||
_ => {
|
||||
let start = idx;
|
||||
while idx < len
|
||||
&& !matches!(
|
||||
line[idx],
|
||||
NL | CR | TAB | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F
|
||||
)
|
||||
{
|
||||
idx += 1;
|
||||
}
|
||||
push_ascii_segment(&line[start..idx], ctx)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
|
||||
if segment.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mut remaining = segment;
|
||||
|
||||
while !remaining.is_empty() {
|
||||
if *ctx.col_count >= ctx.width {
|
||||
emit_output(ctx)?;
|
||||
continue;
|
||||
}
|
||||
|
||||
let available = ctx.width - *ctx.col_count;
|
||||
let take = remaining.len().min(available);
|
||||
let base_len = ctx.output.len();
|
||||
|
||||
ctx.output.extend_from_slice(&remaining[..take]);
|
||||
*ctx.col_count += take;
|
||||
|
||||
if ctx.spaces {
|
||||
if let Some(pos) = remaining[..take]
|
||||
.iter()
|
||||
.rposition(|b| b.is_ascii_whitespace() && *b != CR)
|
||||
{
|
||||
*ctx.last_space = Some(base_len + pos);
|
||||
}
|
||||
} else {
|
||||
*ctx.last_space = None;
|
||||
}
|
||||
|
||||
remaining = &remaining[take..];
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> {
|
||||
if line.is_ascii() {
|
||||
return process_ascii_line(line.as_bytes(), ctx);
|
||||
}
|
||||
|
||||
let line_bytes = line.as_bytes();
|
||||
let mut iter = line.char_indices().peekable();
|
||||
|
||||
while let Some((byte_idx, ch)) = iter.next() {
|
||||
let next_idx = iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len());
|
||||
|
||||
if ch == '\n' {
|
||||
*ctx.last_space = None;
|
||||
emit_output(ctx)?;
|
||||
break;
|
||||
}
|
||||
|
||||
if *ctx.col_count >= ctx.width {
|
||||
emit_output(ctx)?;
|
||||
}
|
||||
|
||||
if ch == '\r' {
|
||||
ctx.output
|
||||
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
|
||||
*ctx.col_count = 0;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ch == '\x08' {
|
||||
ctx.output
|
||||
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
|
||||
*ctx.col_count = ctx.col_count.saturating_sub(1);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ctx.mode == WidthMode::Columns && ch == '\t' {
|
||||
loop {
|
||||
let next_stop = next_tab_stop(*ctx.col_count);
|
||||
if next_stop > ctx.width && !ctx.output.is_empty() {
|
||||
emit_output(ctx)?;
|
||||
continue;
|
||||
}
|
||||
*ctx.col_count = next_stop;
|
||||
break;
|
||||
}
|
||||
if ctx.spaces {
|
||||
*ctx.last_space = Some(ctx.output.len());
|
||||
} else {
|
||||
*ctx.last_space = None;
|
||||
}
|
||||
ctx.output
|
||||
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
|
||||
continue;
|
||||
}
|
||||
|
||||
let added = match ctx.mode {
|
||||
WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0),
|
||||
WidthMode::Characters => 1,
|
||||
};
|
||||
|
||||
if ctx.mode == WidthMode::Columns
|
||||
&& added > 0
|
||||
&& *ctx.col_count + added > ctx.width
|
||||
&& !ctx.output.is_empty()
|
||||
{
|
||||
emit_output(ctx)?;
|
||||
}
|
||||
|
||||
if ctx.spaces && ch.is_ascii_whitespace() {
|
||||
*ctx.last_space = Some(ctx.output.len());
|
||||
}
|
||||
|
||||
ctx.output
|
||||
.extend_from_slice(&line_bytes[byte_idx..next_idx]);
|
||||
*ctx.col_count = ctx.col_count.saturating_add(added);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
|
||||
for &byte in line {
|
||||
if byte == NL {
|
||||
*ctx.last_space = None;
|
||||
emit_output(ctx)?;
|
||||
break;
|
||||
}
|
||||
|
||||
if *ctx.col_count >= ctx.width {
|
||||
emit_output(ctx)?;
|
||||
}
|
||||
|
||||
match byte {
|
||||
CR => *ctx.col_count = 0,
|
||||
TAB => {
|
||||
let next_stop = next_tab_stop(*ctx.col_count);
|
||||
if next_stop > ctx.width && !ctx.output.is_empty() {
|
||||
emit_output(ctx)?;
|
||||
}
|
||||
*ctx.col_count = next_stop;
|
||||
*ctx.last_space = if ctx.spaces {
|
||||
Some(ctx.output.len())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
ctx.output.push(byte);
|
||||
continue;
|
||||
}
|
||||
0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1),
|
||||
_ if ctx.spaces && byte.is_ascii_whitespace() => {
|
||||
*ctx.last_space = Some(ctx.output.len());
|
||||
*ctx.col_count = ctx.col_count.saturating_add(1);
|
||||
}
|
||||
_ => *ctx.col_count = ctx.col_count.saturating_add(1),
|
||||
}
|
||||
|
||||
ctx.output.push(byte);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Fold `file` to fit `width` (number of columns).
|
||||
///
|
||||
/// By default `fold` treats tab, backspace, and carriage return specially:
|
||||
@@ -226,6 +560,7 @@ fn fold_file<T: Read, W: Write>(
|
||||
mut file: BufReader<T>,
|
||||
spaces: bool,
|
||||
width: usize,
|
||||
mode: WidthMode,
|
||||
writer: &mut W,
|
||||
) -> UResult<()> {
|
||||
let mut line = Vec::new();
|
||||
@@ -233,30 +568,6 @@ fn fold_file<T: Read, W: Write>(
|
||||
let mut col_count = 0;
|
||||
let mut last_space = None;
|
||||
|
||||
/// Print the output line, resetting the column and character counts.
|
||||
///
|
||||
/// If `spaces` is `true`, print the output line up to the last
|
||||
/// encountered whitespace character (inclusive) and set the remaining
|
||||
/// characters as the start of the next line.
|
||||
macro_rules! emit_output {
|
||||
() => {
|
||||
let consume = match last_space {
|
||||
Some(i) => i + 1,
|
||||
None => output.len(),
|
||||
};
|
||||
|
||||
writer.write_all(&output[..consume])?;
|
||||
writer.write_all(&[NL])?;
|
||||
output.drain(..consume);
|
||||
|
||||
// we know there are no tabs left in output, so each char counts
|
||||
// as 1 column
|
||||
col_count = output.len();
|
||||
|
||||
last_space = None;
|
||||
};
|
||||
}
|
||||
|
||||
loop {
|
||||
if file
|
||||
.read_until(NL, &mut line)
|
||||
@@ -266,50 +577,27 @@ fn fold_file<T: Read, W: Write>(
|
||||
break;
|
||||
}
|
||||
|
||||
for ch in &line {
|
||||
if *ch == NL {
|
||||
// make sure to _not_ split output at whitespace, since we
|
||||
// know the entire output will fit
|
||||
last_space = None;
|
||||
emit_output!();
|
||||
break;
|
||||
}
|
||||
let mut ctx = FoldContext {
|
||||
spaces,
|
||||
width,
|
||||
mode,
|
||||
writer,
|
||||
output: &mut output,
|
||||
col_count: &mut col_count,
|
||||
last_space: &mut last_space,
|
||||
};
|
||||
|
||||
if col_count >= width {
|
||||
emit_output!();
|
||||
}
|
||||
|
||||
match *ch {
|
||||
CR => col_count = 0,
|
||||
TAB => {
|
||||
let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH;
|
||||
|
||||
if next_tab_stop > width && !output.is_empty() {
|
||||
emit_output!();
|
||||
}
|
||||
|
||||
col_count = next_tab_stop;
|
||||
last_space = if spaces { Some(output.len()) } else { None };
|
||||
}
|
||||
0x08 => {
|
||||
col_count = col_count.saturating_sub(1);
|
||||
}
|
||||
_ if spaces && ch.is_ascii_whitespace() => {
|
||||
last_space = Some(output.len());
|
||||
col_count += 1;
|
||||
}
|
||||
_ => col_count += 1,
|
||||
}
|
||||
|
||||
output.push(*ch);
|
||||
match std::str::from_utf8(&line) {
|
||||
Ok(s) => process_utf8_line(s, &mut ctx)?,
|
||||
Err(_) => process_non_utf8_line(&line, &mut ctx)?,
|
||||
}
|
||||
|
||||
if !output.is_empty() {
|
||||
writer.write_all(&output)?;
|
||||
output.truncate(0);
|
||||
}
|
||||
line.clear();
|
||||
}
|
||||
|
||||
line.truncate(0);
|
||||
if !output.is_empty() {
|
||||
writer.write_all(&output)?;
|
||||
output.clear();
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() {
|
||||
.stdout_is_fixture("lorem_ipsum_new_line_80_column.expected");
|
||||
}
|
||||
|
||||
/// In the default column mode a wide character takes more than one column,
/// so only two of them fit on a five-column line.
#[test]
fn test_wide_characters_in_column_mode() {
    let input = "\u{B250}\u{B250}\u{B250}\n";
    let expected = "\u{B250}\u{B250}\n\u{B250}\n";

    new_ucmd!()
        .args(&["-w", "5"])
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
|
||||
|
||||
/// With `--characters` each character counts as one position, so three wide
/// characters fit within a width of five and the line is left unfolded.
#[test]
fn test_wide_characters_with_characters_option() {
    let input = "\u{B250}\u{B250}\u{B250}\n";
    let expected = "\u{B250}\u{B250}\u{B250}\n";

    new_ucmd!()
        .args(&["--characters", "-w", "5"])
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
|
||||
|
||||
#[test]
|
||||
fn test_should_preserve_empty_line_without_final_newline() {
|
||||
new_ucmd!()
|
||||
|
||||
Reference in New Issue
Block a user