Merge pull request #9126 from mattsu2020/fold_fix

fix(fold): GNU fold-characters.sh test
2026-05-06 07:26:38 -04:00 · 2025-11-13 19:41:39 +09:00
parent 5bb907b348
commit 4a48c9e35e
7 changed files with 378 additions and 67 deletions
@@ -120,6 +120,7 @@ pseudoprimes
 quantiles
 readonly
 reparse
+rposition
 seedable
 semver
 semiprime
@@ -3495,6 +3495,7 @@ dependencies = [
 "codspeed-divan-compat",
 "fluent",
 "tempfile",
+ "unicode-width 0.2.2",
 "uucore",
 ]

@@ -21,6 +21,7 @@ path = "src/fold.rs"
 clap = { workspace = true }
 uucore = { workspace = true }
 fluent = { workspace = true }
+unicode-width = { workspace = true }

 [dev-dependencies]
 divan = { workspace = true }
@@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given)
  to standard output whilst breaking long lines
 fold-usage = fold [OPTION]... [FILE]...
 fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially)
+fold-characters-help = count using character positions rather than display columns
 fold-spaces-help = break lines at word boundaries rather than a hard cut-off
 fold-width-help = set WIDTH as the maximum line width rather than 80
 fold-error-illegal-width = illegal width value
@@ -1,6 +1,7 @@
 fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues
 fold-usage = fold [OPTION]... [FICHIER]...
 fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement)
+fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage
 fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe
 fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80
 fold-error-illegal-width = valeur de largeur illégale
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
 use std::fs::File;
 use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
 use std::path::Path;
+use unicode_width::UnicodeWidthChar;
 use uucore::display::Quotable;
 use uucore::error::{FromIo, UResult, USimpleError};
 use uucore::format_usage;
@@ -21,11 +22,28 @@ const TAB: u8 = b'\t';

 mod options {
    pub const BYTES: &str = "bytes";
+    pub const CHARACTERS: &str = "characters";
    pub const SPACES: &str = "spaces";
    pub const WIDTH: &str = "width";
    pub const FILE: &str = "file";
 }

+/// Unit in which the fold width is measured when input is not folded
+/// byte-wise.
+#[derive(Clone, Copy, PartialEq, Eq)]
+enum WidthMode {
+    /// Measure display columns: a character contributes its Unicode display
+    /// width and a tab advances to the next tab stop.
+    Columns,
+    /// Measure character positions: a character contributes one position
+    /// regardless of how wide it is displayed.
+    Characters,
+}
+
+/// Mutable folding state shared by the per-line helpers.
+///
+/// The buffers and counters are borrowed rather than owned, so the state
+/// they hold outlives a single context value.
+struct FoldContext<'a, W: Write> {
+    // Whether whitespace positions are tracked as preferred break points.
+    spaces: bool,
+    // Maximum width of an output line, in the unit selected by `mode`.
+    width: usize,
+    // Unit used to measure `width` and `col_count`.
+    mode: WidthMode,
+    // Destination that finished output lines are written to.
+    writer: &'a mut W,
+    // Bytes of the current output line that have not been written yet.
+    output: &'a mut Vec<u8>,
+    // Width accumulated so far for the bytes held in `output`.
+    col_count: &'a mut usize,
+    // Index into `output` of the most recent break candidate, if any.
+    last_space: &'a mut Option<usize>,
+}
+
 #[uucore::main]
 pub fn uumain(args: impl uucore::Args) -> UResult<()> {
    let args = args.collect_lossy();
@@ -34,6 +52,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
    let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;

    let bytes = matches.get_flag(options::BYTES);
+    let characters = matches.get_flag(options::CHARACTERS);
    let spaces = matches.get_flag(options::SPACES);
    let poss_width = match matches.get_one::<String>(options::WIDTH) {
        Some(v) => Some(v.clone()),
@@ -55,7 +74,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
        None => vec!["-".to_owned()],
    };

-    fold(&files, bytes, spaces, width)
+    fold(&files, bytes, characters, spaces, width)
 }

 pub fn uu_app() -> Command {
@@ -72,6 +91,13 @@ pub fn uu_app() -> Command {
                .help(translate!("fold-bytes-help"))
                .action(ArgAction::SetTrue),
        )
+        .arg(
+            Arg::new(options::CHARACTERS)
+                .long(options::CHARACTERS)
+                .help(translate!("fold-characters-help"))
+                .conflicts_with(options::BYTES)
+                .action(ArgAction::SetTrue),
+        )
        .arg(
            Arg::new(options::SPACES)
                .long(options::SPACES)
@@ -107,7 +133,13 @@ fn handle_obsolete(args: &[String]) -> (Vec<String>, Option<String>) {
    (args.to_vec(), None)
 }

-fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> {
+fn fold(
+    filenames: &[String],
+    bytes: bool,
+    characters: bool,
+    spaces: bool,
+    width: usize,
+) -> UResult<()> {
    let mut output = BufWriter::new(stdout());

    for filename in filenames {
@@ -125,7 +157,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul
        if bytes {
            fold_file_bytewise(buffer, spaces, width, &mut output)?;
        } else {
-            fold_file(buffer, spaces, width, &mut output)?;
+            let mode = if characters {
+                WidthMode::Characters
+            } else {
+                WidthMode::Columns
+            };
+            fold_file(buffer, spaces, width, mode, &mut output)?;
        }
    }

@@ -213,6 +250,303 @@ fn fold_file_bytewise<T: Read, W: Write>(
    Ok(())
 }

+/// Return the column of the first tab stop strictly after `col_count`.
+///
+/// Tab stops are located at every multiple of `TAB_WIDTH`.
+fn next_tab_stop(col_count: usize) -> usize {
+    let one_tab_further = col_count + TAB_WIDTH;
+    let past_previous_stop = col_count % TAB_WIDTH;
+    one_tab_further - past_previous_stop
+}
+
+/// Measure the width of `buffer` under `mode`, starting from column zero.
+///
+/// * `WidthMode::Characters`: the number of `char`s when `buffer` is valid
+///   UTF-8, otherwise its length in bytes.
+/// * `WidthMode::Columns`: a carriage return resets the width, a tab
+///   advances to the next tab stop, a backspace steps back one column
+///   (never below zero) and any other character adds its Unicode display
+///   width, or nothing when that width is undefined. A buffer that is not
+///   valid UTF-8 is measured byte by byte with one column per ordinary byte.
+fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
+    let decoded = std::str::from_utf8(buffer);
+
+    match (mode, decoded) {
+        (WidthMode::Characters, Ok(text)) => text.chars().count(),
+        (WidthMode::Characters, Err(_)) => buffer.len(),
+        (WidthMode::Columns, Ok(text)) => text.chars().fold(0usize, |column: usize, ch| match ch {
+            '\r' => 0,
+            '\t' => next_tab_stop(column),
+            '\x08' => column.saturating_sub(1),
+            _ => column + UnicodeWidthChar::width(ch).unwrap_or(0),
+        }),
+        (WidthMode::Columns, Err(_)) => {
+            buffer.iter().fold(0usize, |column: usize, &byte| match byte {
+                CR => 0,
+                TAB => next_tab_stop(column),
+                0x08 => column.saturating_sub(1),
+                _ => column + 1,
+            })
+        }
+    }
+}
+
+/// Write the pending output line followed by a newline, then reset the state.
+///
+/// When a break position is recorded in `last_space`, only the bytes up to
+/// and including that position are written; the bytes after it stay in
+/// `output` as the start of the next line. Without a recorded position the
+/// whole buffer is written. Afterwards `col_count` is recomputed from the
+/// bytes left in `output` and `last_space` is cleared.
+///
+/// # Errors
+///
+/// Returns any error reported by the underlying writer.
+fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
+    let split_at = match *ctx.last_space {
+        None => ctx.output.len(),
+        Some(break_pos) => break_pos + 1,
+    };
+
+    if split_at > 0 {
+        ctx.writer.write_all(&ctx.output[..split_at])?;
+    }
+    ctx.writer.write_all(&[NL])?;
+
+    // Draining the full range leaves the buffer empty, so the "everything
+    // was written" case needs no separate handling.
+    ctx.output.drain(..split_at);
+    *ctx.col_count = compute_col_count(ctx.output, ctx.mode);
+
+    // A recorded break position is always `split_at - 1`, i.e. part of what
+    // was just written, so no break candidate survives into the remainder.
+    *ctx.last_space = None;
+
+    Ok(())
+}
+
+/// Fold a line that consists solely of ASCII bytes.
+///
+/// Special bytes are handled one at a time:
+/// * a newline ends the line and flushes the whole pending output;
+/// * a carriage return resets the column and a backspace moves it back by
+///   one (never below zero);
+/// * a tab advances to the next tab stop, but only in `WidthMode::Columns`;
+/// * the remaining control bytes are stored without changing the column.
+///
+/// Every other byte is gathered into a run and handed to
+/// `push_ascii_segment`, which counts one position per byte.
+///
+/// In `WidthMode::Characters` a tab belongs to such a run, i.e. it is an
+/// ordinary one-position character, which is also how `process_utf8_line`
+/// treats it. The run scanner must therefore not stop at a tab in that
+/// mode: if it did, the tab would match no arm that consumes it and the
+/// loop would never advance.
+///
+/// # Errors
+///
+/// Propagates write errors from `emit_output` and `push_ascii_segment`.
+fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
+    // Tabs only need dedicated handling when display columns are counted.
+    let tab_is_special = ctx.mode == WidthMode::Columns;
+    let mut idx = 0;
+    let len = line.len();
+
+    while idx < len {
+        match line[idx] {
+            NL => {
+                // The whole pending line fits, so do not split at whitespace.
+                *ctx.last_space = None;
+                emit_output(ctx)?;
+                break;
+            }
+            CR => {
+                ctx.output.push(CR);
+                *ctx.col_count = 0;
+                idx += 1;
+            }
+            0x08 => {
+                ctx.output.push(0x08);
+                *ctx.col_count = ctx.col_count.saturating_sub(1);
+                idx += 1;
+            }
+            TAB if tab_is_special => {
+                // Flush until the tab fits or there is nothing left to flush.
+                loop {
+                    let next_stop = next_tab_stop(*ctx.col_count);
+                    if next_stop > ctx.width && !ctx.output.is_empty() {
+                        emit_output(ctx)?;
+                        continue;
+                    }
+                    *ctx.col_count = next_stop;
+                    break;
+                }
+                if ctx.spaces {
+                    *ctx.last_space = Some(ctx.output.len());
+                } else {
+                    *ctx.last_space = None;
+                }
+                ctx.output.push(TAB);
+                idx += 1;
+            }
+            0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => {
+                ctx.output.push(line[idx]);
+                if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR {
+                    *ctx.last_space = Some(ctx.output.len() - 1);
+                } else if !ctx.spaces {
+                    *ctx.last_space = None;
+                }
+                idx += 1;
+            }
+            _ => {
+                // The byte at `idx` matched none of the arms above, so it is
+                // part of the run and at least one byte is always consumed.
+                let start = idx;
+                while idx < len {
+                    let ends_run = match line[idx] {
+                        TAB => tab_is_special,
+                        NL | CR | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => true,
+                        _ => false,
+                    };
+                    if ends_run {
+                        break;
+                    }
+                    idx += 1;
+                }
+                push_ascii_segment(&line[start..idx], ctx)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Append a run of ordinary ASCII bytes, folding whenever the line is full.
+///
+/// Each byte counts as one position. The run is copied in chunks that fill
+/// the room left on the current line; a full line is flushed with
+/// `emit_output` before the next chunk is taken. When `ctx.spaces` is set,
+/// the last whitespace byte of each chunk becomes the new break candidate.
+///
+/// Forward progress is guaranteed even for a width of zero: when the line
+/// is "full" although nothing is buffered, one byte is accepted anyway
+/// instead of flushing an empty line over and over. Whether a zero width
+/// can actually reach this function depends on argument validation outside
+/// this block; for any positive width the behaviour is unchanged.
+///
+/// # Errors
+///
+/// Propagates write errors from `emit_output`.
+fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
+    if segment.is_empty() {
+        return Ok(());
+    }
+
+    let mut remaining = segment;
+
+    while !remaining.is_empty() {
+        let available = ctx.width.saturating_sub(*ctx.col_count);
+
+        // Flush a full line, except when the width is zero and there is
+        // nothing to flush: emitting would change nothing and loop forever.
+        if available == 0 && (ctx.width > 0 || !ctx.output.is_empty()) {
+            emit_output(ctx)?;
+            continue;
+        }
+
+        // `available` is only zero in the zero-width case handled above.
+        let take = remaining.len().min(available.max(1));
+        let base_len = ctx.output.len();
+
+        ctx.output.extend_from_slice(&remaining[..take]);
+        *ctx.col_count += take;
+
+        if ctx.spaces {
+            if let Some(pos) = remaining[..take]
+                .iter()
+                .rposition(|b| b.is_ascii_whitespace() && *b != CR)
+            {
+                *ctx.last_space = Some(base_len + pos);
+            }
+        } else {
+            *ctx.last_space = None;
+        }
+
+        remaining = &remaining[take..];
+    }
+
+    Ok(())
+}
+
+/// Fold one line of valid UTF-8 text.
+///
+/// Pure-ASCII lines are delegated to `process_ascii_line`. Otherwise the
+/// line is walked character by character and the original bytes of each
+/// character are appended to `ctx.output`, flushing with `emit_output`
+/// whenever the configured width would be exceeded.
+///
+/// # Errors
+///
+/// Propagates write errors from `emit_output` and `process_ascii_line`.
+fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> {
+    if line.is_ascii() {
+        return process_ascii_line(line.as_bytes(), ctx);
+    }
+
+    let line_bytes = line.as_bytes();
+    let mut iter = line.char_indices().peekable();
+
+    while let Some((byte_idx, ch)) = iter.next() {
+        // End of this character's encoding: the start of the next character,
+        // or the end of the line for the last one.
+        let next_idx = iter.peek().map(|(idx, _)| *idx).unwrap_or(line_bytes.len());
+
+        if ch == '\n' {
+            // The whole pending line is written, so no split at whitespace.
+            *ctx.last_space = None;
+            emit_output(ctx)?;
+            break;
+        }
+
+        // This check runs before carriage return and backspace are examined,
+        // so a full line is flushed even if the next character would have
+        // moved the column back.
+        if *ctx.col_count >= ctx.width {
+            emit_output(ctx)?;
+        }
+
+        if ch == '\r' {
+            ctx.output
+                .extend_from_slice(&line_bytes[byte_idx..next_idx]);
+            *ctx.col_count = 0;
+            continue;
+        }
+
+        if ch == '\x08' {
+            ctx.output
+                .extend_from_slice(&line_bytes[byte_idx..next_idx]);
+            *ctx.col_count = ctx.col_count.saturating_sub(1);
+            continue;
+        }
+
+        // Tabs jump to the next tab stop only when counting display columns;
+        // in character mode they fall through and count as one position.
+        if ctx.mode == WidthMode::Columns && ch == '\t' {
+            // Flush until the tab fits or there is nothing left to flush.
+            loop {
+                let next_stop = next_tab_stop(*ctx.col_count);
+                if next_stop > ctx.width && !ctx.output.is_empty() {
+                    emit_output(ctx)?;
+                    continue;
+                }
+                *ctx.col_count = next_stop;
+                break;
+            }
+            if ctx.spaces {
+                *ctx.last_space = Some(ctx.output.len());
+            } else {
+                *ctx.last_space = None;
+            }
+            ctx.output
+                .extend_from_slice(&line_bytes[byte_idx..next_idx]);
+            continue;
+        }
+
+        // Width contributed by this character; characters without a defined
+        // display width contribute nothing in column mode.
+        let added = match ctx.mode {
+            WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0),
+            WidthMode::Characters => 1,
+        };
+
+        // A character that would cross the limit starts a new line, unless
+        // the current line is still empty.
+        if ctx.mode == WidthMode::Columns
+            && added > 0
+            && *ctx.col_count + added > ctx.width
+            && !ctx.output.is_empty()
+        {
+            emit_output(ctx)?;
+        }
+
+        // Record the index this whitespace character is about to occupy.
+        if ctx.spaces && ch.is_ascii_whitespace() {
+            *ctx.last_space = Some(ctx.output.len());
+        }
+
+        ctx.output
+            .extend_from_slice(&line_bytes[byte_idx..next_idx]);
+        *ctx.col_count = ctx.col_count.saturating_add(added);
+    }
+
+    Ok(())
+}
+
+/// Fold one line that is not valid UTF-8, treating every byte as a character.
+///
+/// A newline ends the line and flushes the whole pending output. A carriage
+/// return resets the column and a backspace moves it back by one (never
+/// below zero). A tab advances to the next tab stop in
+/// `WidthMode::Columns`; in `WidthMode::Characters` it counts as a single
+/// position, which is how `process_ascii_line`, `process_utf8_line` and the
+/// recount in `compute_col_count` treat it. Every other byte adds one
+/// position.
+///
+/// # Errors
+///
+/// Propagates write errors from `emit_output`.
+fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
+    for &byte in line {
+        if byte == NL {
+            // The whole pending line is written, so no split at whitespace.
+            *ctx.last_space = None;
+            emit_output(ctx)?;
+            break;
+        }
+
+        if *ctx.col_count >= ctx.width {
+            emit_output(ctx)?;
+        }
+
+        match byte {
+            CR => *ctx.col_count = 0,
+            // Only column mode expands tabs; otherwise the tab falls through
+            // to the generic arms below and counts as one position.
+            TAB if ctx.mode == WidthMode::Columns => {
+                let next_stop = next_tab_stop(*ctx.col_count);
+                if next_stop > ctx.width && !ctx.output.is_empty() {
+                    emit_output(ctx)?;
+                }
+                *ctx.col_count = next_stop;
+                *ctx.last_space = if ctx.spaces {
+                    Some(ctx.output.len())
+                } else {
+                    None
+                };
+                ctx.output.push(byte);
+                continue;
+            }
+            0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1),
+            _ if ctx.spaces && byte.is_ascii_whitespace() => {
+                // Record the index this whitespace byte is about to occupy.
+                *ctx.last_space = Some(ctx.output.len());
+                *ctx.col_count = ctx.col_count.saturating_add(1);
+            }
+            _ => *ctx.col_count = ctx.col_count.saturating_add(1),
+        }
+
+        ctx.output.push(byte);
+    }
+
+    Ok(())
+}
+
 /// Fold `file` to fit `width` (number of columns).
 ///
 /// By default `fold` treats tab, backspace, and carriage return specially:
@@ -226,6 +560,7 @@ fn fold_file<T: Read, W: Write>(
    mut file: BufReader<T>,
    spaces: bool,
    width: usize,
+    mode: WidthMode,
    writer: &mut W,
 ) -> UResult<()> {
    let mut line = Vec::new();
@@ -233,30 +568,6 @@ fn fold_file<T: Read, W: Write>(
    let mut col_count = 0;
    let mut last_space = None;

-    /// Print the output line, resetting the column and character counts.
-    ///
-    /// If `spaces` is `true`, print the output line up to the last
-    /// encountered whitespace character (inclusive) and set the remaining
-    /// characters as the start of the next line.
-    macro_rules! emit_output {
-        () => {
-            let consume = match last_space {
-                Some(i) => i + 1,
-                None => output.len(),
-            };
-
-            writer.write_all(&output[..consume])?;
-            writer.write_all(&[NL])?;
-            output.drain(..consume);
-
-            // we know there are no tabs left in output, so each char counts
-            // as 1 column
-            col_count = output.len();
-
-            last_space = None;
-        };
-    }
-
    loop {
        if file
            .read_until(NL, &mut line)
@@ -266,50 +577,27 @@ fn fold_file<T: Read, W: Write>(
            break;
        }

-        for ch in &line {
-            if *ch == NL {
-                // make sure to _not_ split output at whitespace, since we
-                // know the entire output will fit
-                last_space = None;
-                emit_output!();
-                break;
-            }
+        let mut ctx = FoldContext {
+            spaces,
+            width,
+            mode,
+            writer,
+            output: &mut output,
+            col_count: &mut col_count,
+            last_space: &mut last_space,
+        };

-            if col_count >= width {
-                emit_output!();
-            }
-
-            match *ch {
-                CR => col_count = 0,
-                TAB => {
-                    let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH;
-
-                    if next_tab_stop > width && !output.is_empty() {
-                        emit_output!();
-                    }
-
-                    col_count = next_tab_stop;
-                    last_space = if spaces { Some(output.len()) } else { None };
-                }
-                0x08 => {
-                    col_count = col_count.saturating_sub(1);
-                }
-                _ if spaces && ch.is_ascii_whitespace() => {
-                    last_space = Some(output.len());
-                    col_count += 1;
-                }
-                _ => col_count += 1,
-            }
-
-            output.push(*ch);
+        match std::str::from_utf8(&line) {
+            Ok(s) => process_utf8_line(s, &mut ctx)?,
+            Err(_) => process_non_utf8_line(&line, &mut ctx)?,
        }

-        if !output.is_empty() {
-            writer.write_all(&output)?;
-            output.truncate(0);
-        }
+        line.clear();
+    }

-        line.truncate(0);
+    if !output.is_empty() {
+        writer.write_all(&output)?;
+        output.clear();
    }

    Ok(())
@@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() {
        .stdout_is_fixture("lorem_ipsum_new_line_80_column.expected");
 }

+/// In the default column mode, three U+B250 characters folded at width 5
+/// are expected to break after the second one, i.e. each is counted as
+/// wider than a single column.
+#[test]
+fn test_wide_characters_in_column_mode() {
+    new_ucmd!()
+        .args(&["-w", "5"])
+        .pipe_in("\u{B250}\u{B250}\u{B250}\n")
+        .succeeds()
+        .stdout_is("\u{B250}\u{B250}\n\u{B250}\n");
+}
+
+/// With `--characters`, the same three U+B250 characters count as three
+/// positions, so at width 5 the line is expected to pass through unfolded.
+#[test]
+fn test_wide_characters_with_characters_option() {
+    new_ucmd!()
+        .args(&["--characters", "-w", "5"])
+        .pipe_in("\u{B250}\u{B250}\u{B250}\n")
+        .succeeds()
+        .stdout_is("\u{B250}\u{B250}\u{B250}\n");
+}
+
 #[test]
 fn test_should_preserve_empty_line_without_final_newline() {
    new_ucmd!()