sort: GNU coreutils test (sort-merge-fdlimit.sh) (#9849)

This commit is contained in:
mattsu
2026-01-19 06:46:17 +09:00
committed by GitHub
parent 525d1f88ec
commit 87c332c727
9 changed files with 606 additions and 304 deletions
Generated
+327 -279
View File
File diff suppressed because it is too large Load Diff
+2
View File
@@ -89,6 +89,8 @@ skip = [
{ name = "itertools", version = "0.13.0" },
# ordered-multimap
{ name = "hashbrown", version = "0.14.5" },
# lru (via num-prime)
{ name = "hashbrown", version = "0.15.5" },
# cexpr (via bindgen)
{ name = "nom", version = "7.1.3" },
# const-random-macro, rand_core
+3 -1
View File
@@ -27,7 +27,6 @@ bigdecimal = { workspace = true }
binary-heap-plus = { workspace = true }
clap = { workspace = true }
compare = { workspace = true }
ctrlc = { workspace = true }
fnv = { workspace = true }
itertools = { workspace = true }
memchr = { workspace = true }
@@ -46,6 +45,9 @@ uucore = { workspace = true, features = [
] }
fluent = { workspace = true }
[target.'cfg(not(target_os = "redox"))'.dependencies]
ctrlc = { workspace = true }
[target.'cfg(unix)'.dependencies]
nix = { workspace = true, features = ["resource"] }
+1
View File
@@ -85,6 +85,7 @@ sort-help-numeric = compare according to string numerical value
sort-help-general-numeric = compare according to string general numerical value
sort-help-version-sort = Sort by SemVer version number, eg 1.12.2 > 1.1.2
sort-help-random = shuffle in random order
sort-help-random-source = use FILE as a source of random data
sort-help-dictionary-order = consider only blanks and alphanumeric characters
sort-help-merge = merge already sorted files; do not sort
sort-help-check = check for sorted input; do not sort
+1
View File
@@ -69,6 +69,7 @@ sort-help-numeric = compare selon la valeur numérique de la chaîne
sort-help-general-numeric = compare selon la valeur numérique générale de la chaîne
sort-help-version-sort = Trie par numéro de version SemVer, par ex. 1.12.2 > 1.1.2
sort-help-random = mélange dans un ordre aléatoire
sort-help-random-source = utilise FICHIER comme source de données aléatoires
sort-help-dictionary-order = considère seulement les espaces et les caractères alphanumériques
sort-help-merge = fusionne les fichiers déjà triés ; ne trie pas
sort-help-check = vérifie l'entrée triée ; ne trie pas
+87 -9
View File
@@ -5,11 +5,13 @@
//! Utilities for reading files as chunks.
// spell-checker:ignore ELEMS
#![allow(dead_code)]
// Ignores non-used warning for `borrow_buffer` in `Chunk`
use std::{
io::{ErrorKind, Read},
ops::Range,
sync::mpsc::SyncSender,
};
@@ -17,7 +19,12 @@ use memchr::memchr_iter;
use self_cell::self_cell;
use uucore::error::{UResult, USimpleError};
use crate::{GeneralBigDecimalParseResult, GlobalSettings, Line, numeric_str_cmp::NumInfo};
use crate::{
GeneralBigDecimalParseResult, GlobalSettings, Line, SortMode, numeric_str_cmp::NumInfo,
};
const MAX_TOKEN_BUFFER_BYTES: usize = 4 * 1024 * 1024;
const MAX_TOKEN_BUFFER_ELEMS: usize = MAX_TOKEN_BUFFER_BYTES / std::mem::size_of::<Range<usize>>();
self_cell!(
/// The chunk that is passed around between threads.
@@ -35,6 +42,8 @@ self_cell!(
pub struct ChunkContents<'a> {
pub lines: Vec<Line<'a>>,
pub line_data: LineData<'a>,
pub token_buffer: Vec<Range<usize>>,
pub line_count_hint: usize,
}
#[derive(Debug)]
@@ -54,6 +63,7 @@ impl Chunk {
contents.line_data.num_infos.clear();
contents.line_data.parsed_floats.clear();
contents.line_data.line_num_floats.clear();
contents.token_buffer.clear();
let lines = unsafe {
// SAFETY: It is safe to (temporarily) transmute to a vector of lines with a longer lifetime,
// because the vector is empty.
@@ -76,6 +86,8 @@ impl Chunk {
std::mem::take(&mut contents.line_data.num_infos),
std::mem::take(&mut contents.line_data.parsed_floats),
std::mem::take(&mut contents.line_data.line_num_floats),
std::mem::take(&mut contents.token_buffer),
contents.line_count_hint,
)
});
RecycledChunk {
@@ -84,6 +96,8 @@ impl Chunk {
num_infos: recycled_contents.2,
parsed_floats: recycled_contents.3,
line_num_floats: recycled_contents.4,
token_buffer: recycled_contents.5,
line_count_hint: recycled_contents.6,
buffer: self.into_owner(),
}
}
@@ -103,6 +117,8 @@ pub struct RecycledChunk {
num_infos: Vec<NumInfo>,
parsed_floats: Vec<GeneralBigDecimalParseResult>,
line_num_floats: Vec<Option<f64>>,
token_buffer: Vec<Range<usize>>,
line_count_hint: usize,
buffer: Vec<u8>,
}
@@ -114,6 +130,8 @@ impl RecycledChunk {
num_infos: Vec::new(),
parsed_floats: Vec::new(),
line_num_floats: Vec::new(),
token_buffer: Vec::new(),
line_count_hint: 0,
buffer: vec![0; capacity],
}
}
@@ -157,6 +175,8 @@ pub fn read<T: Read>(
num_infos,
parsed_floats,
line_num_floats,
mut token_buffer,
mut line_count_hint,
mut buffer,
} = recycled_chunk;
if buffer.len() < carry_over.len() {
@@ -193,8 +213,21 @@ pub fn read<T: Read>(
parsed_floats,
line_num_floats,
};
parse_lines(read, &mut lines, &mut line_data, separator, settings);
Ok(ChunkContents { lines, line_data })
parse_lines(
read,
&mut lines,
&mut line_data,
&mut token_buffer,
&mut line_count_hint,
separator,
settings,
);
Ok(ChunkContents {
lines,
line_data,
token_buffer,
line_count_hint,
})
});
sender.send(payload?).unwrap();
}
@@ -206,6 +239,8 @@ fn parse_lines<'a>(
read: &'a [u8],
lines: &mut Vec<Line<'a>>,
line_data: &mut LineData<'a>,
token_buffer: &mut Vec<Range<usize>>,
line_count_hint: &mut usize,
separator: u8,
settings: &GlobalSettings,
) {
@@ -216,12 +251,55 @@ fn parse_lines<'a>(
assert!(line_data.num_infos.is_empty());
assert!(line_data.parsed_floats.is_empty());
assert!(line_data.line_num_floats.is_empty());
let mut token_buffer = vec![];
lines.extend(
read.split(|&c| c == separator)
.enumerate()
.map(|(index, line)| Line::create(line, index, line_data, &mut token_buffer, settings)),
);
token_buffer.clear();
if token_buffer.capacity() > MAX_TOKEN_BUFFER_ELEMS {
token_buffer.shrink_to(MAX_TOKEN_BUFFER_ELEMS);
}
const SMALL_CHUNK_BYTES: usize = 64 * 1024;
let mut estimated = (*line_count_hint).max(1);
let mut exact_line_count = None;
if *line_count_hint == 0 || read.len() <= SMALL_CHUNK_BYTES {
let count = if read.is_empty() {
1
} else {
memchr_iter(separator, read).count() + 1
};
exact_line_count = Some(count);
estimated = count;
} else if estimated == 1 {
const LINE_LEN_HINT: usize = 32;
estimated = (read.len() / LINE_LEN_HINT).max(1);
}
lines.reserve(estimated);
if settings.precomputed.selections_per_line > 0 {
line_data
.selections
.reserve(estimated.saturating_mul(settings.precomputed.selections_per_line));
}
if settings.precomputed.num_infos_per_line > 0 {
line_data
.num_infos
.reserve(estimated.saturating_mul(settings.precomputed.num_infos_per_line));
}
if settings.precomputed.floats_per_line > 0 {
line_data
.parsed_floats
.reserve(estimated.saturating_mul(settings.precomputed.floats_per_line));
}
if settings.mode == SortMode::Numeric {
line_data.line_num_floats.reserve(estimated);
}
let mut start = 0usize;
let mut index = 0usize;
for sep_idx in memchr_iter(separator, read) {
let line = &read[start..sep_idx];
lines.push(Line::create(line, index, line_data, token_buffer, settings));
index += 1;
start = sep_idx + 1;
}
let line = &read[start..];
lines.push(Line::create(line, index, line_data, token_buffer, settings));
*line_count_hint = exact_line_count.unwrap_or(index + 1);
}
/// Read from `file` into `buffer`.
+10 -5
View File
@@ -30,7 +30,7 @@ use uucore::error::{FromIo, UResult};
use crate::{
GlobalSettings, Output, SortError,
chunks::{self, Chunk, RecycledChunk},
compare_by, fd_soft_limit, open,
compare_by, current_open_fd_count, fd_soft_limit, open,
tmp_dir::TmpDirWrapper,
};
@@ -66,14 +66,19 @@ fn replace_output_file_in_input_files(
/// file-descriptor soft limit after reserving stdio/output and a safety margin.
fn effective_merge_batch_size(settings: &GlobalSettings) -> usize {
const MIN_BATCH_SIZE: usize = 2;
const RESERVED_STDIO: usize = 3;
const RESERVED_OUTPUT: usize = 1;
const RESERVED_TMP_OUTPUT: usize = 1;
const RESERVED_CTRL_C: usize = 2;
const RESERVED_RANDOM_SOURCE: usize = 1;
const SAFETY_MARGIN: usize = 1;
let mut batch_size = settings.merge_batch_size.max(MIN_BATCH_SIZE);
if let Some(limit) = fd_soft_limit() {
let reserved = RESERVED_STDIO + RESERVED_OUTPUT + SAFETY_MARGIN;
let available_inputs = limit.saturating_sub(reserved);
let open_fds = current_open_fd_count().unwrap_or(3);
let mut reserved = RESERVED_TMP_OUTPUT + RESERVED_CTRL_C + SAFETY_MARGIN;
if settings.salt.is_some() {
reserved = reserved.saturating_add(RESERVED_RANDOM_SOURCE);
}
let available_inputs = limit.saturating_sub(open_fds.saturating_add(reserved));
if available_inputs >= MIN_BATCH_SIZE {
batch_size = batch_size.min(available_inputs);
} else {
+154 -8
View File
@@ -7,7 +7,7 @@
// https://pubs.opengroup.org/onlinepubs/9699919799/utilities/sort.html
// https://www.gnu.org/software/coreutils/manual/html_node/sort-invocation.html
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef
// spell-checker:ignore (misc) HFKJFK Mbdfhn getrlimit RLIMIT_NOFILE rlim bigdecimal extendedbigdecimal hexdigit behaviour keydef GETFD
mod buffer_hint;
mod check;
@@ -104,6 +104,7 @@ mod options {
pub const TMP_DIR: &str = "temporary-directory";
pub const COMPRESS_PROG: &str = "compress-program";
pub const BATCH_SIZE: &str = "batch-size";
pub const RANDOM_SOURCE: &str = "random-source";
pub const FILES: &str = "files";
}
@@ -274,6 +275,7 @@ pub struct GlobalSettings {
check: bool,
check_silent: bool,
salt: Option<[u8; 16]>,
random_source: Option<PathBuf>,
selectors: Vec<FieldSelector>,
separator: Option<u8>,
threads: String,
@@ -402,6 +404,7 @@ impl Default for GlobalSettings {
check: false,
check_silent: false,
salt: None,
random_source: None,
selectors: vec![],
separator: None,
threads: String::new(),
@@ -584,6 +587,14 @@ impl<'a> Line<'a> {
token_buffer: &mut Vec<Field>,
settings: &GlobalSettings,
) -> Self {
let needs_line_data = settings.precomputed.needs_tokens
|| settings.precomputed.selections_per_line > 0
|| settings.precomputed.num_infos_per_line > 0
|| settings.precomputed.floats_per_line > 0
|| settings.mode == SortMode::Numeric;
if !needs_line_data {
return Self { line, index };
}
token_buffer.clear();
if settings.precomputed.needs_tokens {
tokenize(line, settings.separator, token_buffer);
@@ -1203,7 +1214,16 @@ fn make_sort_mode_arg(mode: &'static str, short: char, help: String) -> Arg {
.action(ArgAction::SetTrue)
}
#[cfg(target_os = "linux")]
#[cfg(all(
unix,
not(any(
target_os = "redox",
target_os = "fuchsia",
target_os = "haiku",
target_os = "solaris",
target_os = "illumos"
))
))]
fn get_rlimit() -> UResult<usize> {
use nix::sys::resource::{RLIM_INFINITY, Resource, getrlimit};
@@ -1216,16 +1236,74 @@ fn get_rlimit() -> UResult<usize> {
.map_err(|_| UUsageError::new(2, translate!("sort-failed-fetch-rlimit")))
}
#[cfg(target_os = "linux")]
/// Soft limit on open file descriptors (`RLIMIT_NOFILE`) for this process,
/// or `None` if `getrlimit` fails. Only compiled on Unix targets where the
/// `nix` resource API is available.
#[cfg(all(
    unix,
    not(any(
        target_os = "redox",
        target_os = "fuchsia",
        target_os = "haiku",
        target_os = "solaris",
        target_os = "illumos"
    ))
))]
pub(crate) fn fd_soft_limit() -> Option<usize> {
    get_rlimit().ok()
}
#[cfg(not(target_os = "linux"))]
/// Fallback for platforms without `getrlimit` support: the descriptor soft
/// limit cannot be queried, so callers must fall back to their defaults.
#[cfg(any(
    not(unix),
    target_os = "redox",
    target_os = "fuchsia",
    target_os = "haiku",
    target_os = "solaris",
    target_os = "illumos"
))]
pub(crate) fn fd_soft_limit() -> Option<usize> {
    None
}
#[cfg(unix)]
pub(crate) fn current_open_fd_count() -> Option<usize> {
use nix::libc;
fn count_dir(path: &str) -> Option<usize> {
let entries = std::fs::read_dir(path).ok()?;
let mut count = 0usize;
for entry in entries.flatten() {
let name = entry.file_name();
let name = name.to_string_lossy();
if name.parse::<usize>().is_ok() {
count = count.saturating_add(1);
}
}
Some(count)
}
if let Some(count) = count_dir("/proc/self/fd").or_else(|| count_dir("/dev/fd")) {
return Some(count);
}
let limit = fd_soft_limit()?;
if limit > 16_384 {
return None;
}
let mut count = 0usize;
for fd in 0..limit {
let fd = fd as libc::c_int;
// Probe with libc::fcntl because the fd may be invalid.
if unsafe { libc::fcntl(fd, libc::F_GETFD) } != -1 {
count = count.saturating_add(1);
}
}
Some(count)
}
/// Non-Unix fallback: the number of open descriptors cannot be queried.
#[cfg(not(unix))]
pub(crate) fn current_open_fd_count() -> Option<usize> {
    None
}
const STDIN_FILE: &str = "-";
/// Legacy `+POS1 [-POS2]` syntax is permitted unless `_POSIX2_VERSION` is in
@@ -1776,6 +1854,9 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
}
settings.debug = matches.get_flag(options::DEBUG);
if let Some(path) = matches.get_one::<OsString>(options::RANDOM_SOURCE) {
settings.random_source = Some(PathBuf::from(path));
}
// check whether user specified a zero terminated list of files for input, otherwise read files from args
let mut files: Vec<OsString> = if matches.contains_id(options::FILES0_FROM) {
@@ -2036,9 +2117,6 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
if let Some(values) = matches.get_many::<String>(options::KEY) {
for value in values {
let selector = FieldSelector::parse(value, &settings)?;
if selector.settings.mode == SortMode::Random && settings.salt.is_none() {
settings.salt = Some(get_rand_string());
}
settings.selectors.push(selector);
}
}
@@ -2060,6 +2138,18 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
);
}
let needs_random = settings.mode == SortMode::Random
|| settings
.selectors
.iter()
.any(|selector| selector.settings.mode == SortMode::Random);
if needs_random {
settings.salt = Some(match settings.random_source.as_deref() {
Some(path) => salt_from_random_source(path)?,
None => get_rand_string(),
});
}
// Verify that we can open all input files.
// It is the correct behavior to close all files afterwards,
// and to reopen them at a later point. This is different from how the output file is handled,
@@ -2158,6 +2248,14 @@ pub fn uu_app() -> Command {
'R',
translate!("sort-help-random"),
))
.arg(
Arg::new(options::RANDOM_SOURCE)
.long(options::RANDOM_SOURCE)
.help(translate!("sort-help-random-source"))
.value_name("FILE")
.value_parser(ValueParser::os_string())
.value_hint(clap::ValueHint::FilePath),
)
.arg(
Arg::new(options::DICTIONARY_ORDER)
.short('d')
@@ -2667,10 +2765,58 @@ fn general_numeric_compare(
a.partial_cmp(b).unwrap()
}
fn get_rand_string() -> [u8; 16] {
/// Generate a 128-bit salt from a uniform RNG distribution.
fn get_rand_string() -> [u8; SALT_LEN] {
    // StandardUniform sampled into [u8; SALT_LEN] yields uniformly random bytes.
    rng().sample(rand::distr::StandardUniform)
}
const SALT_LEN: usize = 16; // 128-bit salt
const MAX_BYTES: usize = 1024 * 1024; // Read cap: 1 MiB
const BUF_LEN: usize = 8192; // 8 KiB read buffer
const U64_LEN: usize = 8;
const RANDOM_SOURCE_TAG: &[u8] = b"uutils-sort-random-source"; // Domain separation tag

/// Create a 128-bit salt by hashing up to 1 MiB from the given file.
///
/// Errors with `SortError::ReadFailed` if reading the source fails; opening
/// errors are reported by `open_with_open_failed_error`.
fn salt_from_random_source(path: &Path) -> UResult<[u8; SALT_LEN]> {
    let mut reader = open_with_open_failed_error(path)?;
    let mut buf = [0u8; BUF_LEN];
    let mut total = 0usize;
    let mut hasher = FnvHasher::default();
    loop {
        let n = reader
            .read(&mut buf)
            .map_err(|error| SortError::ReadFailed {
                path: path.to_owned(),
                error,
            })?;
        if n == 0 {
            break; // EOF
        }
        // Enforce the 1 MiB cap: hash only as many bytes as remain under it.
        let remaining = MAX_BYTES.saturating_sub(total);
        if remaining == 0 {
            break;
        }
        let take = n.min(remaining);
        hasher.write(&buf[..take]);
        total = total.saturating_add(take);
        if take < n {
            // The cap was hit mid-read; discard the rest and stop.
            break;
        }
    }
    // First half of the salt: FNV hash of the source bytes.
    let first = hasher.finish();
    // Second half: hash of a fixed tag plus the first word, so the two halves
    // differ while deriving deterministically from the same input.
    // NOTE(review): the salt therefore carries at most 64 bits of entropy from
    // the source file — confirm this is acceptable for -R determinism needs.
    let mut second_hasher = FnvHasher::default();
    second_hasher.write(RANDOM_SOURCE_TAG);
    second_hasher.write_u64(first);
    let second = second_hasher.finish();
    let mut out = [0u8; SALT_LEN];
    out[..U64_LEN].copy_from_slice(&first.to_le_bytes());
    out[U64_LEN..].copy_from_slice(&second.to_le_bytes());
    Ok(out)
}
fn get_hash<T: Hash>(t: &T) -> u64 {
let mut s = FnvHasher::default();
t.hash(&mut s);
+21 -2
View File
@@ -15,7 +15,7 @@ use uucore::{
show_error, translate,
};
use crate::SortError;
use crate::{SortError, current_open_fd_count, fd_soft_limit};
/// A wrapper around [`TempDir`] that may only exist once in a process.
///
@@ -45,6 +45,17 @@ fn handler_state() -> Arc<Mutex<HandlerRegistration>> {
.clone()
}
/// Decide whether installing the ctrl-c handler is safe given the fd budget.
///
/// The handler needs descriptors of its own; skip installation only when the
/// soft limit is known and too tight to also cover a minimal merge.
fn should_install_signal_handler() -> bool {
    // Descriptors consumed by the ctrl-c machinery itself.
    const CTRL_C_FDS: usize = 2;
    // temp output + minimum inputs
    const RESERVED_FOR_MERGE: usize = 3;
    match fd_soft_limit() {
        // Unknown limit: assume there is room and install the handler.
        None => true,
        Some(limit) => {
            let in_use = current_open_fd_count().unwrap_or(3);
            in_use.saturating_add(CTRL_C_FDS + RESERVED_FOR_MERGE) <= limit
        }
    }
}
#[cfg(not(target_os = "redox"))]
fn ensure_signal_handler_installed(state: Arc<Mutex<HandlerRegistration>>) -> UResult<()> {
// This shared state must originate from `handler_state()` so the handler always sees
// the current lock/path pair and can clean up the active temp directory on SIGINT.
@@ -94,6 +105,11 @@ fn ensure_signal_handler_installed(state: Arc<Mutex<HandlerRegistration>>) -> UR
Ok(())
}
/// Redox lacks the ctrl-c support this handler relies on, so installation is
/// a no-op there; temp-dir cleanup on SIGINT is simply unavailable.
#[cfg(target_os = "redox")]
fn ensure_signal_handler_installed(_state: Arc<Mutex<HandlerRegistration>>) -> UResult<()> {
    Ok(())
}
impl TmpDirWrapper {
pub fn new(path: PathBuf) -> Self {
Self {
@@ -124,7 +140,10 @@ impl TmpDirWrapper {
guard.path = Some(path);
}
ensure_signal_handler_installed(state)
if should_install_signal_handler() {
ensure_signal_handler_installed(state)?;
}
Ok(())
}
pub fn next_file(&mut self) -> UResult<(File, PathBuf)> {