Merge pull request #9126 from mattsu2020/fold_fix

fix(fold): GNU fold-characters.sh test
This commit is contained in:
mattsu
2025-11-13 19:41:39 +09:00
committed by GitHub
parent 5bb907b348
commit 4a48c9e35e
7 changed files with 378 additions and 67 deletions
+1
View File
@@ -120,6 +120,7 @@ pseudoprimes
quantiles
readonly
reparse
rposition
seedable
semver
semiprime
Generated
+1
View File
@@ -3495,6 +3495,7 @@ dependencies = [
"codspeed-divan-compat",
"fluent",
"tempfile",
"unicode-width 0.2.2",
"uucore",
]
+1
View File
@@ -21,6 +21,7 @@ path = "src/fold.rs"
clap = { workspace = true }
uucore = { workspace = true }
fluent = { workspace = true }
unicode-width = { workspace = true }
[dev-dependencies]
divan = { workspace = true }
+1
View File
@@ -2,6 +2,7 @@ fold-about = Writes each file (or standard input if no files are given)
to standard output whilst breaking long lines
fold-usage = fold [OPTION]... [FILE]...
fold-bytes-help = count using bytes rather than columns (meaning control characters such as newline are not treated specially)
fold-characters-help = count using character positions rather than display columns
fold-spaces-help = break lines at word boundaries rather than a hard cut-off
fold-width-help = set WIDTH as the maximum line width rather than 80
fold-error-illegal-width = illegal width value
+1
View File
@@ -1,6 +1,7 @@
fold-about = Écrit chaque fichier (ou l'entrée standard si aucun fichier n'est donné) sur la sortie standard en coupant les lignes trop longues
fold-usage = fold [OPTION]... [FICHIER]...
fold-bytes-help = compter en octets plutôt qu'en colonnes (les caractères de contrôle comme retour chariot ne sont pas traités spécialement)
fold-characters-help = compter en caractères plutôt qu'en colonnes d'affichage
fold-spaces-help = couper les lignes aux limites de mots plutôt qu'à une largeur fixe
fold-width-help = définir WIDTH comme largeur de ligne maximale au lieu de 80
fold-error-illegal-width = valeur de largeur illégale
+355 -67
View File
@@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Command};
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Read, Write, stdin, stdout};
use std::path::Path;
use unicode_width::UnicodeWidthChar;
use uucore::display::Quotable;
use uucore::error::{FromIo, UResult, USimpleError};
use uucore::format_usage;
@@ -21,11 +22,28 @@ const TAB: u8 = b'\t';
/// Identifiers under which `fold`'s command-line arguments are registered
/// with clap and later looked up in the parsed matches.
mod options {
    // Boolean flags.
    /// Flag that switches to byte-wise folding.
    pub const BYTES: &str = "bytes";
    /// Flag that counts character positions instead of display columns.
    pub const CHARACTERS: &str = "characters";
    /// Flag that makes `fold` break lines at whitespace.
    pub const SPACES: &str = "spaces";

    // Arguments that carry a value.
    /// Option holding the maximum line width.
    pub const WIDTH: &str = "width";
    /// Input file arguments (presumably positional; confirm in `uu_app`).
    pub const FILE: &str = "file";
}
/// How the length of a line is measured when `fold` is not counting bytes.
///
/// `Debug` is derived so the mode can be printed in assertions and
/// diagnostics.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum WidthMode {
    /// Measure display columns: tabs advance to the next tab stop and every
    /// other character contributes its Unicode display width.
    Columns,
    /// Count each character as exactly one position, whatever its display
    /// width.
    Characters,
}
/// Folding state shared by the per-line helpers.
///
/// The buffers and counters are borrowed mutably, so the values they hold
/// outlive any single `FoldContext`.
struct FoldContext<'a, W>
where
    W: Write,
{
    /// When `true`, whitespace positions are remembered so a line can be
    /// broken right after the last one.
    spaces: bool,
    /// Maximum number of positions (columns or characters) per output line.
    width: usize,
    /// How positions are counted.
    mode: WidthMode,
    /// Destination for finished output lines.
    writer: &'a mut W,
    /// Bytes of the output line that has not been written yet.
    output: &'a mut Vec<u8>,
    /// Position reached by the bytes currently held in `output`.
    col_count: &'a mut usize,
    /// Index into `output` of the most recent whitespace byte that may serve
    /// as a break point, if any.
    last_space: &'a mut Option<usize>,
}
#[uucore::main]
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let args = args.collect_lossy();
@@ -34,6 +52,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
let matches = uucore::clap_localization::handle_clap_result(uu_app(), args)?;
let bytes = matches.get_flag(options::BYTES);
let characters = matches.get_flag(options::CHARACTERS);
let spaces = matches.get_flag(options::SPACES);
let poss_width = match matches.get_one::<String>(options::WIDTH) {
Some(v) => Some(v.clone()),
@@ -55,7 +74,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
None => vec!["-".to_owned()],
};
fold(&files, bytes, spaces, width)
fold(&files, bytes, characters, spaces, width)
}
pub fn uu_app() -> Command {
@@ -72,6 +91,13 @@ pub fn uu_app() -> Command {
.help(translate!("fold-bytes-help"))
.action(ArgAction::SetTrue),
)
.arg(
Arg::new(options::CHARACTERS)
.long(options::CHARACTERS)
.help(translate!("fold-characters-help"))
.conflicts_with(options::BYTES)
.action(ArgAction::SetTrue),
)
.arg(
Arg::new(options::SPACES)
.long(options::SPACES)
@@ -107,7 +133,13 @@ fn handle_obsolete(args: &[String]) -> (Vec<String>, Option<String>) {
(args.to_vec(), None)
}
fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResult<()> {
fn fold(
filenames: &[String],
bytes: bool,
characters: bool,
spaces: bool,
width: usize,
) -> UResult<()> {
let mut output = BufWriter::new(stdout());
for filename in filenames {
@@ -125,7 +157,12 @@ fn fold(filenames: &[String], bytes: bool, spaces: bool, width: usize) -> UResul
if bytes {
fold_file_bytewise(buffer, spaces, width, &mut output)?;
} else {
fold_file(buffer, spaces, width, &mut output)?;
let mode = if characters {
WidthMode::Characters
} else {
WidthMode::Columns
};
fold_file(buffer, spaces, width, mode, &mut output)?;
}
}
@@ -213,6 +250,303 @@ fn fold_file_bytewise<T: Read, W: Write>(
Ok(())
}
/// Returns the first tab stop strictly after `col_count`.
///
/// Tab stops are the multiples of `TAB_WIDTH`; a column that already sits on
/// a stop advances by a full `TAB_WIDTH`.
fn next_tab_stop(col_count: usize) -> usize {
    let past_previous_stop = col_count % TAB_WIDTH;
    col_count + TAB_WIDTH - past_previous_stop
}
/// Computes the position reached after laying out `buffer` from position 0.
///
/// * `WidthMode::Characters`: the number of characters when `buffer` is valid
///   UTF-8, otherwise the number of bytes.
/// * `WidthMode::Columns`: carriage return resets the column to 0, tab moves
///   to the next tab stop, backspace steps back one column (never below 0),
///   and any other character adds its Unicode display width (0 when
///   `UnicodeWidthChar::width` reports none). If `buffer` is not valid UTF-8
///   the same rules are applied per byte, every other byte counting as one
///   column.
fn compute_col_count(buffer: &[u8], mode: WidthMode) -> usize {
    let decoded = std::str::from_utf8(buffer);
    match (mode, decoded) {
        (WidthMode::Characters, Ok(text)) => text.chars().count(),
        (WidthMode::Characters, Err(_)) => buffer.len(),
        (WidthMode::Columns, Ok(text)) => text.chars().fold(0usize, |col, ch| match ch {
            '\r' => 0,
            '\t' => next_tab_stop(col),
            '\x08' => col.saturating_sub(1),
            _ => col + UnicodeWidthChar::width(ch).unwrap_or(0),
        }),
        (WidthMode::Columns, Err(_)) => buffer.iter().fold(0usize, |col, &byte| match byte {
            CR => 0,
            TAB => next_tab_stop(col),
            0x08 => col.saturating_sub(1),
            _ => col + 1,
        }),
    }
}
/// Writes the pending output line, terminated by a newline, and resets the
/// folding state.
///
/// When a break position is recorded in `ctx.last_space`, only the bytes up
/// to and including that position are written and the rest of `ctx.output`
/// is kept as the start of the next line. Otherwise the whole buffer is
/// written. Afterwards `ctx.col_count` is recomputed from whatever remains
/// and `ctx.last_space` is cleared.
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
fn emit_output<W: Write>(ctx: &mut FoldContext<'_, W>) -> UResult<()> {
    let split_at = match *ctx.last_space {
        Some(space_idx) => space_idx + 1,
        None => ctx.output.len(),
    };

    if split_at > 0 {
        ctx.writer.write_all(&ctx.output[..split_at])?;
    }
    ctx.writer.write_all(&[NL])?;

    // Draining the full range empties the buffer, so no separate `clear` is
    // needed when everything was written.
    ctx.output.drain(..split_at);
    *ctx.col_count = compute_col_count(ctx.output, ctx.mode);

    // A recorded break index always lies inside the part just written, so it
    // never carries over to the next line.
    *ctx.last_space = None;

    Ok(())
}
fn process_ascii_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
let mut idx = 0;
let len = line.len();
while idx < len {
match line[idx] {
NL => {
*ctx.last_space = None;
emit_output(ctx)?;
break;
}
CR => {
ctx.output.push(CR);
*ctx.col_count = 0;
idx += 1;
}
0x08 => {
ctx.output.push(0x08);
*ctx.col_count = ctx.col_count.saturating_sub(1);
idx += 1;
}
TAB if ctx.mode == WidthMode::Columns => {
loop {
let next_stop = next_tab_stop(*ctx.col_count);
if next_stop > ctx.width && !ctx.output.is_empty() {
emit_output(ctx)?;
continue;
}
*ctx.col_count = next_stop;
break;
}
if ctx.spaces {
*ctx.last_space = Some(ctx.output.len());
} else {
*ctx.last_space = None;
}
ctx.output.push(TAB);
idx += 1;
}
0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F => {
ctx.output.push(line[idx]);
if ctx.spaces && line[idx].is_ascii_whitespace() && line[idx] != CR {
*ctx.last_space = Some(ctx.output.len() - 1);
} else if !ctx.spaces {
*ctx.last_space = None;
}
idx += 1;
}
_ => {
let start = idx;
while idx < len
&& !matches!(
line[idx],
NL | CR | TAB | 0x08 | 0x00..=0x07 | 0x0B..=0x0C | 0x0E..=0x1F | 0x7F
)
{
idx += 1;
}
push_ascii_segment(&line[start..idx], ctx)?;
}
}
}
Ok(())
}
/// Appends a run of plain ASCII bytes to the pending output, flushing a line
/// whenever the width limit is reached.
///
/// Each byte of `segment` counts as one position. With `ctx.spaces` set, the
/// last whitespace byte (other than carriage return) of every appended chunk
/// is recorded as the break position; otherwise the break position is
/// cleared.
///
/// NOTE(review): if `ctx.width` is 0 the flush branch below can never make
/// room, so this loop would not terminate. Presumably a zero width is
/// rejected before folding starts; confirm against the caller.
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
fn push_ascii_segment<W: Write>(segment: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
    let mut pending = segment;

    while !pending.is_empty() {
        if *ctx.col_count >= ctx.width {
            emit_output(ctx)?;
            continue;
        }

        // Copy as many bytes as still fit on the current line.
        let room = ctx.width - *ctx.col_count;
        let chunk_len = pending.len().min(room);
        let chunk = &pending[..chunk_len];
        let chunk_start = ctx.output.len();

        ctx.output.extend_from_slice(chunk);
        *ctx.col_count += chunk_len;

        if !ctx.spaces {
            *ctx.last_space = None;
        } else if let Some(offset) = chunk
            .iter()
            .rposition(|byte| byte.is_ascii_whitespace() && *byte != CR)
        {
            *ctx.last_space = Some(chunk_start + offset);
        }

        pending = &pending[chunk_len..];
    }

    Ok(())
}
/// Folds one line of valid UTF-8 text.
///
/// Lines that are pure ASCII are delegated to `process_ascii_line`. Otherwise
/// the line is walked character by character: a newline flushes the pending
/// output and ends processing, carriage return and backspace adjust the
/// column, a tab advances to the next tab stop in `WidthMode::Columns`, and
/// every other character adds either its Unicode display width (columns mode)
/// or 1 (characters mode).
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
fn process_utf8_line<W: Write>(line: &str, ctx: &mut FoldContext<'_, W>) -> UResult<()> {
    if line.is_ascii() {
        return process_ascii_line(line.as_bytes(), ctx);
    }

    let raw = line.as_bytes();
    for (start, ch) in line.char_indices() {
        // The exact bytes that encode `ch` in the input.
        let encoded = &raw[start..start + ch.len_utf8()];

        if ch == '\n' {
            // The whole pending line fits, so do not split at whitespace.
            *ctx.last_space = None;
            emit_output(ctx)?;
            break;
        }

        if *ctx.col_count >= ctx.width {
            emit_output(ctx)?;
        }

        match ch {
            '\r' => {
                ctx.output.extend_from_slice(encoded);
                *ctx.col_count = 0;
            }
            '\x08' => {
                ctx.output.extend_from_slice(encoded);
                *ctx.col_count = ctx.col_count.saturating_sub(1);
            }
            '\t' if ctx.mode == WidthMode::Columns => {
                // Flush until the tab fits; the tab stop depends on the
                // column, which changes with every flush.
                while next_tab_stop(*ctx.col_count) > ctx.width && !ctx.output.is_empty() {
                    emit_output(ctx)?;
                }
                *ctx.col_count = next_tab_stop(*ctx.col_count);
                *ctx.last_space = if ctx.spaces {
                    Some(ctx.output.len())
                } else {
                    None
                };
                ctx.output.extend_from_slice(encoded);
            }
            _ => {
                let added = match ctx.mode {
                    WidthMode::Columns => UnicodeWidthChar::width(ch).unwrap_or(0),
                    WidthMode::Characters => 1,
                };

                // A character wider than the remaining room starts a new
                // line, unless nothing is pending.
                let needs_break = ctx.mode == WidthMode::Columns
                    && added > 0
                    && *ctx.col_count + added > ctx.width
                    && !ctx.output.is_empty();
                if needs_break {
                    emit_output(ctx)?;
                }

                if ctx.spaces && ch.is_ascii_whitespace() {
                    *ctx.last_space = Some(ctx.output.len());
                }
                ctx.output.extend_from_slice(encoded);
                *ctx.col_count = ctx.col_count.saturating_add(added);
            }
        }
    }

    Ok(())
}
/// Folds one line that is not valid UTF-8, treating every byte as one
/// position.
///
/// A newline flushes the pending output and ends processing. Carriage return
/// resets the column to 0, backspace moves it back by one (never below 0),
/// and a tab advances to the next tab stop. With `ctx.spaces` set, tabs and
/// ASCII whitespace bytes are recorded as break positions.
///
/// NOTE(review): unlike `process_ascii_line` and `process_utf8_line`, a tab
/// advances to a tab stop here even in `WidthMode::Characters`; confirm
/// whether that difference is intended.
///
/// # Errors
///
/// Propagates any error returned by the underlying writer.
fn process_non_utf8_line<W: Write>(line: &[u8], ctx: &mut FoldContext<'_, W>) -> UResult<()> {
    for &byte in line {
        if byte == NL {
            // The whole pending line fits, so do not split at whitespace.
            *ctx.last_space = None;
            emit_output(ctx)?;
            break;
        }

        if *ctx.col_count >= ctx.width {
            emit_output(ctx)?;
        }

        match byte {
            CR => *ctx.col_count = 0,
            TAB => {
                // Flushing changes the column, so the tab stop is recomputed
                // after every flush, as in the ASCII and UTF-8 paths. Keeping
                // the pre-flush value would overstate the column and force a
                // spurious line break right after the tab.
                let mut next_stop = next_tab_stop(*ctx.col_count);
                while next_stop > ctx.width && !ctx.output.is_empty() {
                    emit_output(ctx)?;
                    next_stop = next_tab_stop(*ctx.col_count);
                }
                *ctx.col_count = next_stop;
                *ctx.last_space = if ctx.spaces {
                    Some(ctx.output.len())
                } else {
                    None
                };
                ctx.output.push(byte);
                continue;
            }
            0x08 => *ctx.col_count = ctx.col_count.saturating_sub(1),
            _ if ctx.spaces && byte.is_ascii_whitespace() => {
                *ctx.last_space = Some(ctx.output.len());
                *ctx.col_count = ctx.col_count.saturating_add(1);
            }
            _ => *ctx.col_count = ctx.col_count.saturating_add(1),
        }

        ctx.output.push(byte);
    }

    Ok(())
}
/// Fold `file` to fit `width` (number of columns).
///
/// By default `fold` treats tab, backspace, and carriage return specially:
@@ -226,6 +560,7 @@ fn fold_file<T: Read, W: Write>(
mut file: BufReader<T>,
spaces: bool,
width: usize,
mode: WidthMode,
writer: &mut W,
) -> UResult<()> {
let mut line = Vec::new();
@@ -233,30 +568,6 @@ fn fold_file<T: Read, W: Write>(
let mut col_count = 0;
let mut last_space = None;
/// Print the output line, resetting the column and character counts.
///
/// If `spaces` is `true`, print the output line up to the last
/// encountered whitespace character (inclusive) and set the remaining
/// characters as the start of the next line.
macro_rules! emit_output {
() => {
let consume = match last_space {
Some(i) => i + 1,
None => output.len(),
};
writer.write_all(&output[..consume])?;
writer.write_all(&[NL])?;
output.drain(..consume);
// we know there are no tabs left in output, so each char counts
// as 1 column
col_count = output.len();
last_space = None;
};
}
loop {
if file
.read_until(NL, &mut line)
@@ -266,50 +577,27 @@ fn fold_file<T: Read, W: Write>(
break;
}
for ch in &line {
if *ch == NL {
// make sure to _not_ split output at whitespace, since we
// know the entire output will fit
last_space = None;
emit_output!();
break;
}
let mut ctx = FoldContext {
spaces,
width,
mode,
writer,
output: &mut output,
col_count: &mut col_count,
last_space: &mut last_space,
};
if col_count >= width {
emit_output!();
}
match *ch {
CR => col_count = 0,
TAB => {
let next_tab_stop = col_count + TAB_WIDTH - col_count % TAB_WIDTH;
if next_tab_stop > width && !output.is_empty() {
emit_output!();
}
col_count = next_tab_stop;
last_space = if spaces { Some(output.len()) } else { None };
}
0x08 => {
col_count = col_count.saturating_sub(1);
}
_ if spaces && ch.is_ascii_whitespace() => {
last_space = Some(output.len());
col_count += 1;
}
_ => col_count += 1,
}
output.push(*ch);
match std::str::from_utf8(&line) {
Ok(s) => process_utf8_line(s, &mut ctx)?,
Err(_) => process_non_utf8_line(&line, &mut ctx)?,
}
if !output.is_empty() {
writer.write_all(&output)?;
output.truncate(0);
}
line.clear();
}
line.truncate(0);
if !output.is_empty() {
writer.write_all(&output)?;
output.clear();
}
Ok(())
+18
View File
@@ -41,6 +41,24 @@ fn test_default_wrap_with_newlines() {
.stdout_is_fixture("lorem_ipsum_new_line_80_column.expected");
}
#[test]
fn test_wide_characters_in_column_mode() {
    // By default width is measured in display columns: only two of the three
    // wide characters fit within five columns, so the third wraps.
    let input = "\u{B250}\u{B250}\u{B250}\n";
    let expected = "\u{B250}\u{B250}\n\u{B250}\n";

    new_ucmd!()
        .args(&["-w", "5"])
        .pipe_in(input)
        .succeeds()
        .stdout_is(expected);
}
#[test]
fn test_wide_characters_with_characters_option() {
    // With `--characters` each character counts as one position, so three
    // wide characters fit within a width of five and the line is unchanged.
    let input = "\u{B250}\u{B250}\u{B250}\n";

    new_ucmd!()
        .args(&["--characters", "-w", "5"])
        .pipe_in(input)
        .succeeds()
        .stdout_is(input);
}
#[test]
fn test_should_preserve_empty_line_without_final_newline() {
new_ucmd!()