gnu-wc.c (30690B)
1 /* wc - print the number of lines, words, and bytes in files 2 Copyright (C) 1985-2023 Free Software Foundation, Inc. 3 4 This program is free software: you can redistribute it and/or modify 5 it under the terms of the GNU General Public License as published by 6 the Free Software Foundation, either version 3 of the License, or 7 (at your option) any later version. 8 9 This program is distributed in the hope that it will be useful, 10 but WITHOUT ANY WARRANTY; without even the implied warranty of 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 GNU General Public License for more details. 13 14 You should have received a copy of the GNU General Public License 15 along with this program. If not, see <https://www.gnu.org/licenses/>. */ 16 17 /* Written by Paul Rubin, phr@ocf.berkeley.edu 18 and David MacKenzie, djm@gnu.ai.mit.edu. */ 19 20 #include <config.h> 21 22 #include <stdckdint.h> 23 #include <stdio.h> 24 #include <getopt.h> 25 #include <sys/types.h> 26 #include <wchar.h> 27 #include <wctype.h> 28 29 #include "system.h" 30 #include "assure.h" 31 #include "argmatch.h" 32 #include "argv-iter.h" 33 #include "fadvise.h" 34 #include "mbchar.h" 35 #include "physmem.h" 36 #include "readtokens0.h" 37 #include "safe-read.h" 38 #include "stat-size.h" 39 #include "xbinary-io.h" 40 41 #if !defined iswspace && !HAVE_ISWSPACE 42 # define iswspace(wc) \ 43 ((wc) == to_uchar (wc) && isspace (to_uchar (wc))) 44 #endif 45 46 /* The official name of this program (e.g., no 'g' prefix). */ 47 #define PROGRAM_NAME "wc" 48 49 #define AUTHORS \ 50 proper_name ("Paul Rubin"), \ 51 proper_name ("David MacKenzie") 52 53 /* Size of atomic reads. */ 54 #define BUFFER_SIZE (16 * 1024) 55 56 #ifdef USE_AVX2_WC_LINECOUNT 57 /* From wc_avx2.c */ 58 extern bool 59 wc_lines_avx2 (char const *file, int fd, uintmax_t *lines_out, 60 uintmax_t *bytes_out); 61 #endif 62 63 static bool debug; 64 65 /* Cumulative number of lines, words, chars and bytes in all files so far. 66 max_line_length is the maximum over all files processed so far. */ 67 static uintmax_t total_lines; 68 static uintmax_t total_words; 69 static uintmax_t total_chars; 70 static uintmax_t total_bytes; 71 static uintmax_t total_lines_overflow; 72 static uintmax_t total_words_overflow; 73 static uintmax_t total_chars_overflow; 74 static uintmax_t total_bytes_overflow; 75 static uintmax_t max_line_length; 76 77 /* Which counts to print. */ 78 static bool print_lines, print_words, print_chars, print_bytes; 79 static bool print_linelength; 80 81 /* The print width of each count. */ 82 static int number_width; 83 84 /* True if we have ever read the standard input. */ 85 static bool have_read_stdin; 86 87 /* Used to determine if file size can be determined without reading. */ 88 static size_t page_size; 89 90 /* Enable to _not_ treat non breaking space as a word separator. */ 91 static bool posixly_correct; 92 93 /* The result of calling fstat or stat on a file descriptor or file. */ 94 struct fstatus 95 { 96 /* If positive, fstat or stat has not been called yet. Otherwise, 97 this is the value returned from fstat or stat. */ 98 int failed; 99 100 /* If FAILED is zero, this is the file's status. */ 101 struct stat st; 102 }; 103 104 /* For long options that have no equivalent short option, use a 105 non-character as a pseudo short option, starting with CHAR_MAX + 1. */ 106 enum 107 { 108 DEBUG_PROGRAM_OPTION = CHAR_MAX + 1, 109 FILES0_FROM_OPTION, 110 TOTAL_OPTION, 111 }; 112 113 static struct option const longopts[] = 114 { 115 {"bytes", no_argument, nullptr, 'c'}, 116 {"chars", no_argument, nullptr, 'm'}, 117 {"lines", no_argument, nullptr, 'l'}, 118 {"words", no_argument, nullptr, 'w'}, 119 {"debug", no_argument, nullptr, DEBUG_PROGRAM_OPTION}, 120 {"files0-from", required_argument, nullptr, FILES0_FROM_OPTION}, 121 {"max-line-length", no_argument, nullptr, 'L'}, 122 {"total", required_argument, nullptr, TOTAL_OPTION}, 123 {GETOPT_HELP_OPTION_DECL}, 124 {GETOPT_VERSION_OPTION_DECL}, 125 {nullptr, 0, nullptr, 0} 126 }; 127 128 enum total_type 129 { 130 total_auto, /* 0: default or --total=auto */ 131 total_always, /* 1: --total=always */ 132 total_only, /* 2: --total=only */ 133 total_never /* 3: --total=never */ 134 }; 135 static char const *const total_args[] = 136 { 137 "auto", "always", "only", "never", nullptr 138 }; 139 static enum total_type const total_types[] = 140 { 141 total_auto, total_always, total_only, total_never 142 }; 143 ARGMATCH_VERIFY (total_args, total_types); 144 static enum total_type total_mode = total_auto; 145 146 #ifdef USE_AVX2_WC_LINECOUNT 147 static bool 148 avx2_supported (void) 149 { 150 bool avx_enabled = 0 < __builtin_cpu_supports ("avx2"); 151 152 if (debug) 153 error (0, 0, (avx_enabled 154 ? _("using avx2 hardware support") 155 : _("avx2 support not detected"))); 156 157 return avx_enabled; 158 } 159 #endif 160 161 void 162 usage (int status) 163 { 164 if (status != EXIT_SUCCESS) 165 emit_try_help (); 166 else 167 { 168 printf (_("\ 169 Usage: %s [OPTION]... [FILE]...\n\ 170 or: %s [OPTION]... --files0-from=F\n\ 171 "), 172 program_name, program_name); 173 fputs (_("\ 174 Print newline, word, and byte counts for each FILE, and a total line if\n\ 175 more than one FILE is specified. A word is a non-zero-length sequence of\n\ 176 printable characters delimited by white space.\n\ 177 "), stdout); 178 179 emit_stdin_note (); 180 181 fputs (_("\ 182 \n\ 183 The options below may be used to select which counts are printed, always in\n\ 184 the following order: newline, word, character, byte, maximum line length.\n\ 185 -c, --bytes print the byte counts\n\ 186 -m, --chars print the character counts\n\ 187 -l, --lines print the newline counts\n\ 188 "), stdout); 189 fputs (_("\ 190 --files0-from=F read input from the files specified by\n\ 191 NUL-terminated names in file F;\n\ 192 If F is - then read names from standard input\n\ 193 -L, --max-line-length print the maximum display width\n\ 194 -w, --words print the word counts\n\ 195 "), stdout); 196 fputs (_("\ 197 --total=WHEN when to print a line with total counts;\n\ 198 WHEN can be: auto, always, only, never\n\ 199 "), stdout); 200 fputs (HELP_OPTION_DESCRIPTION, stdout); 201 fputs (VERSION_OPTION_DESCRIPTION, stdout); 202 emit_ancillary_info (PROGRAM_NAME); 203 } 204 exit (status); 205 } 206 207 /* Return non zero if a non breaking space. */ 208 ATTRIBUTE_PURE 209 static int 210 iswnbspace (wint_t wc) 211 { 212 return ! posixly_correct 213 && (wc == 0x00A0 || wc == 0x2007 214 || wc == 0x202F || wc == 0x2060); 215 } 216 217 static int 218 isnbspace (int c) 219 { 220 return iswnbspace (btowc (c)); 221 } 222 223 /* FILE is the name of the file (or null for standard input) 224 associated with the specified counters. */ 225 static void 226 write_counts (uintmax_t lines, 227 uintmax_t words, 228 uintmax_t chars, 229 uintmax_t bytes, 230 uintmax_t linelength, 231 char const *file) 232 { 233 static char const format_sp_int[] = " %*s"; 234 char const *format_int = format_sp_int + 1; 235 char buf[INT_BUFSIZE_BOUND (uintmax_t)]; 236 237 if (print_lines) 238 { 239 printf (format_int, number_width, umaxtostr (lines, buf)); 240 format_int = format_sp_int; 241 } 242 if (print_words) 243 { 244 printf (format_int, number_width, umaxtostr (words, buf)); 245 format_int = format_sp_int; 246 } 247 if (print_chars) 248 { 249 printf (format_int, number_width, umaxtostr (chars, buf)); 250 format_int = format_sp_int; 251 } 252 if (print_bytes) 253 { 254 printf (format_int, number_width, umaxtostr (bytes, buf)); 255 format_int = format_sp_int; 256 } 257 if (print_linelength) 258 { 259 printf (format_int, number_width, umaxtostr (linelength, buf)); 260 } 261 if (file) 262 printf (" %s", strchr (file, '\n') ? quotef (file) : file); 263 putchar ('\n'); 264 } 265 266 static bool 267 wc_lines (char const *file, int fd, uintmax_t *lines_out, uintmax_t *bytes_out) 268 { 269 size_t bytes_read; 270 uintmax_t lines, bytes; 271 char buf[BUFFER_SIZE + 1]; 272 bool long_lines = false; 273 274 if (!lines_out || !bytes_out) 275 { 276 return false; 277 } 278 279 lines = bytes = 0; 280 281 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) 282 { 283 284 if (bytes_read == SAFE_READ_ERROR) 285 { 286 error (0, errno, "%s", quotef (file)); 287 return false; 288 } 289 290 bytes += bytes_read; 291 292 char *p = buf; 293 char *end = buf + bytes_read; 294 uintmax_t plines = lines; 295 296 if (! long_lines) 297 { 298 /* Avoid function call overhead for shorter lines. */ 299 while (p != end) 300 lines += *p++ == '\n'; 301 } 302 else 303 { 304 /* rawmemchr is more efficient with longer lines. */ 305 *end = '\n'; 306 while ((p = rawmemchr (p, '\n')) < end) 307 { 308 ++p; 309 ++lines; 310 } 311 } 312 313 /* If the average line length in the block is >= 15, then use 314 memchr for the next block, where system specific optimizations 315 may outweigh function call overhead. 316 FIXME: This line length was determined in 2015, on both 317 x86_64 and ppc64, but it's worth re-evaluating in future with 318 newer compilers, CPUs, or memchr() implementations etc. */ 319 if (lines - plines <= bytes_read / 15) 320 long_lines = true; 321 else 322 long_lines = false; 323 } 324 325 *bytes_out = bytes; 326 *lines_out = lines; 327 328 return true; 329 } 330 331 /* Count words. FILE_X is the name of the file (or null for standard 332 input) that is open on descriptor FD. *FSTATUS is its status. 333 CURRENT_POS is the current file offset if known, negative if unknown. 334 Return true if successful. */ 335 static bool 336 wc (int fd, char const *file_x, struct fstatus *fstatus, off_t current_pos) 337 { 338 bool ok = true; 339 char buf[BUFFER_SIZE + 1]; 340 size_t bytes_read; 341 uintmax_t lines, words, chars, bytes, linelength; 342 bool count_bytes, count_chars, count_complicated; 343 char const *file = file_x ? file_x : _("standard input"); 344 345 lines = words = chars = bytes = linelength = 0; 346 347 /* If in the current locale, chars are equivalent to bytes, we prefer 348 counting bytes, because that's easier. */ 349 #if MB_LEN_MAX > 1 350 if (MB_CUR_MAX > 1) 351 { 352 count_bytes = print_bytes; 353 count_chars = print_chars; 354 } 355 else 356 #endif 357 { 358 count_bytes = print_bytes || print_chars; 359 count_chars = false; 360 } 361 count_complicated = print_words || print_linelength; 362 363 /* Advise the kernel of our access pattern only if we will read(). */ 364 if (!count_bytes || count_chars || print_lines || count_complicated) 365 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL); 366 367 /* When counting only bytes, save some line- and word-counting 368 overhead. If FD is a 'regular' Unix file, using lseek is enough 369 to get its 'size' in bytes. Otherwise, read blocks of BUFFER_SIZE 370 bytes at a time until EOF. Note that the 'size' (number of bytes) 371 that wc reports is smaller than stats.st_size when the file is not 372 positioned at its beginning. That's why the lseek calls below are 373 necessary. For example the command 374 '(dd ibs=99k skip=1 count=0; ./wc -c) < /etc/group' 375 should make wc report '0' bytes. */ 376 377 if (count_bytes && !count_chars && !print_lines && !count_complicated) 378 { 379 bool skip_read = false; 380 381 if (0 < fstatus->failed) 382 fstatus->failed = fstat (fd, &fstatus->st); 383 384 /* For sized files, seek to one st_blksize before EOF rather than to EOF. 385 This works better for files in proc-like file systems where 386 the size is only approximate. */ 387 if (! fstatus->failed && usable_st_size (&fstatus->st) 388 && 0 <= fstatus->st.st_size) 389 { 390 off_t end_pos = fstatus->st.st_size; 391 if (current_pos < 0) 392 current_pos = lseek (fd, 0, SEEK_CUR); 393 394 if (end_pos % page_size) 395 { 396 /* We only need special handling of /proc and /sys files etc. 397 when they're a multiple of PAGE_SIZE. In the common case 398 for files with st_size not a multiple of PAGE_SIZE, 399 it's more efficient and accurate to use st_size. 400 401 Be careful here. The current position may actually be 402 beyond the end of the file. As in the example above. */ 403 404 bytes = end_pos < current_pos ? 0 : end_pos - current_pos; 405 if (bytes && 0 <= lseek (fd, bytes, SEEK_CUR)) 406 skip_read = true; 407 else 408 bytes = 0; 409 } 410 else 411 { 412 off_t hi_pos = (end_pos 413 - end_pos % (STP_BLKSIZE (&fstatus->st) + 1)); 414 if (0 <= current_pos && current_pos < hi_pos 415 && 0 <= lseek (fd, hi_pos, SEEK_CUR)) 416 bytes = hi_pos - current_pos; 417 } 418 } 419 420 if (! skip_read) 421 { 422 fdadvise (fd, 0, 0, FADVISE_SEQUENTIAL); 423 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) 424 { 425 if (bytes_read == SAFE_READ_ERROR) 426 { 427 error (0, errno, "%s", quotef (file)); 428 ok = false; 429 break; 430 } 431 bytes += bytes_read; 432 } 433 } 434 } 435 else if (!count_chars && !count_complicated) 436 { 437 #ifdef USE_AVX2_WC_LINECOUNT 438 static bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *); 439 if (!wc_lines_p) 440 wc_lines_p = avx2_supported () ? wc_lines_avx2 : wc_lines; 441 #else 442 bool (*wc_lines_p) (char const *, int, uintmax_t *, uintmax_t *) 443 = wc_lines; 444 #endif 445 446 /* Use a separate loop when counting only lines or lines and bytes -- 447 but not chars or words. */ 448 ok = wc_lines_p (file, fd, &lines, &bytes); 449 } 450 #if MB_LEN_MAX > 1 451 # define SUPPORT_OLD_MBRTOWC 1 452 else if (MB_CUR_MAX > 1) 453 { 454 bool in_word = false; 455 uintmax_t linepos = 0; 456 mbstate_t state = {0}; 457 bool in_shift = false; 458 # if SUPPORT_OLD_MBRTOWC 459 /* Back-up the state before each multibyte character conversion and 460 move the last incomplete character of the buffer to the front 461 of the buffer. This is needed because we don't know whether 462 the 'mbrtowc' function updates the state when it returns -2, -- 463 this is the ISO C 99 and glibc-2.2 behavior - or not - amended 464 ANSI C, glibc-2.1 and Solaris 5.7 behavior. We don't have an 465 autoconf test for this, yet. */ 466 size_t prev = 0; /* number of bytes carried over from previous round */ 467 # else 468 const size_t prev = 0; 469 # endif 470 471 while ((bytes_read = safe_read (fd, buf + prev, BUFFER_SIZE - prev)) > 0) 472 { 473 char const *p; 474 # if SUPPORT_OLD_MBRTOWC 475 mbstate_t backup_state; 476 # endif 477 if (bytes_read == SAFE_READ_ERROR) 478 { 479 error (0, errno, "%s", quotef (file)); 480 ok = false; 481 break; 482 } 483 484 bytes += bytes_read; 485 p = buf; 486 bytes_read += prev; 487 do 488 { 489 wchar_t wide_char; 490 size_t n; 491 bool wide = true; 492 493 if (!in_shift && is_basic (*p)) 494 { 495 /* Handle most ASCII characters quickly, without calling 496 mbrtowc(). */ 497 n = 1; 498 wide_char = *p; 499 wide = false; 500 } 501 else 502 { 503 in_shift = true; 504 # if SUPPORT_OLD_MBRTOWC 505 backup_state = state; 506 # endif 507 n = mbrtowc (&wide_char, p, bytes_read, &state); 508 if (n == (size_t) -2) 509 { 510 # if SUPPORT_OLD_MBRTOWC 511 state = backup_state; 512 # endif 513 break; 514 } 515 if (n == (size_t) -1) 516 { 517 /* Remember that we read a byte, but don't complain 518 about the error. Because of the decoding error, 519 this is a considered to be byte but not a 520 character (that is, chars is not incremented). */ 521 p++; 522 bytes_read--; 523 continue; 524 } 525 if (mbsinit (&state)) 526 in_shift = false; 527 if (n == 0) 528 { 529 wide_char = 0; 530 n = 1; 531 } 532 } 533 534 switch (wide_char) 535 { 536 case '\n': 537 lines++; 538 FALLTHROUGH; 539 case '\r': 540 case '\f': 541 if (linepos > linelength) 542 linelength = linepos; 543 linepos = 0; 544 goto mb_word_separator; 545 case '\t': 546 linepos += 8 - (linepos % 8); 547 goto mb_word_separator; 548 case ' ': 549 linepos++; 550 FALLTHROUGH; 551 case '\v': 552 mb_word_separator: 553 words += in_word; 554 in_word = false; 555 break; 556 default: 557 if (wide && iswprint (wide_char)) 558 { 559 /* wcwidth can be expensive on OSX for example, 560 so avoid if not needed. */ 561 if (print_linelength) 562 { 563 int width = wcwidth (wide_char); 564 if (width > 0) 565 linepos += width; 566 } 567 if (iswspace (wide_char) || iswnbspace (wide_char)) 568 goto mb_word_separator; 569 in_word = true; 570 } 571 else if (!wide && isprint (to_uchar (*p))) 572 { 573 linepos++; 574 if (isspace (to_uchar (*p))) 575 goto mb_word_separator; 576 in_word = true; 577 } 578 break; 579 } 580 581 p += n; 582 bytes_read -= n; 583 chars++; 584 } 585 while (bytes_read > 0); 586 587 # if SUPPORT_OLD_MBRTOWC 588 if (bytes_read > 0) 589 { 590 if (bytes_read == BUFFER_SIZE) 591 { 592 /* Encountered a very long redundant shift sequence. */ 593 p++; 594 bytes_read--; 595 } 596 memmove (buf, p, bytes_read); 597 } 598 prev = bytes_read; 599 # endif 600 } 601 if (linepos > linelength) 602 linelength = linepos; 603 words += in_word; 604 } 605 #endif 606 else 607 { 608 bool in_word = false; 609 uintmax_t linepos = 0; 610 611 while ((bytes_read = safe_read (fd, buf, BUFFER_SIZE)) > 0) 612 { 613 char const *p = buf; 614 if (bytes_read == SAFE_READ_ERROR) 615 { 616 error (0, errno, "%s", quotef (file)); 617 ok = false; 618 break; 619 } 620 621 bytes += bytes_read; 622 do 623 { 624 switch (*p++) 625 { 626 case '\n': 627 lines++; 628 FALLTHROUGH; 629 case '\r': 630 case '\f': 631 if (linepos > linelength) 632 linelength = linepos; 633 linepos = 0; 634 goto word_separator; 635 case '\t': 636 linepos += 8 - (linepos % 8); 637 goto word_separator; 638 case ' ': 639 linepos++; 640 FALLTHROUGH; 641 case '\v': 642 word_separator: 643 words += in_word; 644 in_word = false; 645 break; 646 default: 647 if (isprint (to_uchar (p[-1]))) 648 { 649 linepos++; 650 if (isspace (to_uchar (p[-1])) 651 || isnbspace (to_uchar (p[-1]))) 652 goto word_separator; 653 in_word = true; 654 } 655 break; 656 } 657 } 658 while (--bytes_read); 659 } 660 if (linepos > linelength) 661 linelength = linepos; 662 words += in_word; 663 } 664 665 if (count_chars < print_chars) 666 chars = bytes; 667 668 if (total_mode != total_only) 669 write_counts (lines, words, chars, bytes, linelength, file_x); 670 671 if (ckd_add (&total_lines, total_lines, lines)) 672 total_lines_overflow = true; 673 if (ckd_add (&total_words, total_words, words)) 674 total_words_overflow = true; 675 if (ckd_add (&total_chars, total_chars, chars)) 676 total_chars_overflow = true; 677 if (ckd_add (&total_bytes, total_bytes, bytes)) 678 total_bytes_overflow = true; 679 680 if (linelength > max_line_length) 681 max_line_length = linelength; 682 683 return ok; 684 } 685 686 static bool 687 wc_file (char const *file, struct fstatus *fstatus) 688 { 689 if (! file || STREQ (file, "-")) 690 { 691 have_read_stdin = true; 692 xset_binary_mode (STDIN_FILENO, O_BINARY); 693 return wc (STDIN_FILENO, file, fstatus, -1); 694 } 695 else 696 { 697 int fd = open (file, O_RDONLY | O_BINARY); 698 if (fd == -1) 699 { 700 error (0, errno, "%s", quotef (file)); 701 return false; 702 } 703 else 704 { 705 bool ok = wc (fd, file, fstatus, 0); 706 if (close (fd) != 0) 707 { 708 error (0, errno, "%s", quotef (file)); 709 return false; 710 } 711 return ok; 712 } 713 } 714 } 715 716 /* Return the file status for the NFILES files addressed by FILE. 717 Optimize the case where only one number is printed, for just one 718 file; in that case we can use a print width of 1, so we don't need 719 to stat the file. Handle the case of (nfiles == 0) in the same way; 720 that happens when we don't know how long the list of file names will be. */ 721 722 static struct fstatus * 723 get_input_fstatus (size_t nfiles, char *const *file) 724 { 725 struct fstatus *fstatus = xnmalloc (nfiles ? nfiles : 1, sizeof *fstatus); 726 727 if (nfiles == 0 728 || (nfiles == 1 729 && ((print_lines + print_words + print_chars 730 + print_bytes + print_linelength) 731 == 1))) 732 fstatus[0].failed = 1; 733 else 734 { 735 for (size_t i = 0; i < nfiles; i++) 736 fstatus[i].failed = (! file[i] || STREQ (file[i], "-") 737 ? fstat (STDIN_FILENO, &fstatus[i].st) 738 : stat (file[i], &fstatus[i].st)); 739 } 740 741 return fstatus; 742 } 743 744 /* Return a print width suitable for the NFILES files whose status is 745 recorded in FSTATUS. Optimize the same special case that 746 get_input_fstatus optimizes. */ 747 748 ATTRIBUTE_PURE 749 static int 750 compute_number_width (size_t nfiles, struct fstatus const *fstatus) 751 { 752 int width = 1; 753 754 if (0 < nfiles && fstatus[0].failed <= 0) 755 { 756 int minimum_width = 1; 757 uintmax_t regular_total = 0; 758 759 for (size_t i = 0; i < nfiles; i++) 760 if (! fstatus[i].failed) 761 { 762 if (S_ISREG (fstatus[i].st.st_mode)) 763 regular_total += fstatus[i].st.st_size; 764 else 765 minimum_width = 7; 766 } 767 768 for (; 10 <= regular_total; regular_total /= 10) 769 width++; 770 if (width < minimum_width) 771 width = minimum_width; 772 } 773 774 return width; 775 } 776 777 778 int 779 main (int argc, char **argv) 780 { 781 bool ok; 782 int optc; 783 size_t nfiles; 784 char **files; 785 char *files_from = nullptr; 786 struct fstatus *fstatus; 787 struct Tokens tok; 788 789 initialize_main (&argc, &argv); 790 set_program_name (argv[0]); 791 setlocale (LC_ALL, ""); 792 bindtextdomain (PACKAGE, LOCALEDIR); 793 textdomain (PACKAGE); 794 795 atexit (close_stdout); 796 797 page_size = getpagesize (); 798 /* Line buffer stdout to ensure lines are written atomically and immediately 799 so that processes running in parallel do not intersperse their output. */ 800 setvbuf (stdout, nullptr, _IOLBF, 0); 801 802 posixly_correct = (getenv ("POSIXLY_CORRECT") != nullptr); 803 804 print_lines = print_words = print_chars = print_bytes = false; 805 print_linelength = false; 806 total_lines = total_words = total_chars = total_bytes = max_line_length = 0; 807 808 while ((optc = getopt_long (argc, argv, "clLmw", longopts, nullptr)) != -1) 809 switch (optc) 810 { 811 case 'c': 812 print_bytes = true; 813 break; 814 815 case 'm': 816 print_chars = true; 817 break; 818 819 case 'l': 820 print_lines = true; 821 break; 822 823 case 'w': 824 print_words = true; 825 break; 826 827 case 'L': 828 print_linelength = true; 829 break; 830 831 case DEBUG_PROGRAM_OPTION: 832 debug = true; 833 break; 834 835 case FILES0_FROM_OPTION: 836 files_from = optarg; 837 break; 838 839 case TOTAL_OPTION: 840 total_mode = XARGMATCH ("--total", optarg, total_args, total_types); 841 break; 842 843 case_GETOPT_HELP_CHAR; 844 845 case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); 846 847 default: 848 usage (EXIT_FAILURE); 849 } 850 851 if (! (print_lines || print_words || print_chars || print_bytes 852 || print_linelength)) 853 print_lines = print_words = print_bytes = true; 854 855 bool read_tokens = false; 856 struct argv_iterator *ai; 857 if (files_from) 858 { 859 FILE *stream; 860 861 /* When using --files0-from=F, you may not specify any files 862 on the command-line. */ 863 if (optind < argc) 864 { 865 error (0, 0, _("extra operand %s"), quoteaf (argv[optind])); 866 fprintf (stderr, "%s\n", 867 _("file operands cannot be combined with --files0-from")); 868 usage (EXIT_FAILURE); 869 } 870 871 if (STREQ (files_from, "-")) 872 stream = stdin; 873 else 874 { 875 stream = fopen (files_from, "r"); 876 if (stream == nullptr) 877 error (EXIT_FAILURE, errno, _("cannot open %s for reading"), 878 quoteaf (files_from)); 879 } 880 881 /* Read the file list into RAM if we can detect its size and that 882 size is reasonable. Otherwise, we'll read a name at a time. */ 883 struct stat st; 884 if (fstat (fileno (stream), &st) == 0 885 && S_ISREG (st.st_mode) 886 && st.st_size <= MIN (10 * 1024 * 1024, physmem_available () / 2)) 887 { 888 read_tokens = true; 889 readtokens0_init (&tok); 890 if (! readtokens0 (stream, &tok) || fclose (stream) != 0) 891 error (EXIT_FAILURE, 0, _("cannot read file names from %s"), 892 quoteaf (files_from)); 893 files = tok.tok; 894 nfiles = tok.n_tok; 895 ai = argv_iter_init_argv (files); 896 } 897 else 898 { 899 files = nullptr; 900 nfiles = 0; 901 ai = argv_iter_init_stream (stream); 902 } 903 } 904 else 905 { 906 static char *stdin_only[] = { nullptr }; 907 files = (optind < argc ? argv + optind : stdin_only); 908 nfiles = (optind < argc ? argc - optind : 1); 909 ai = argv_iter_init_argv (files); 910 } 911 912 if (!ai) 913 xalloc_die (); 914 915 fstatus = get_input_fstatus (nfiles, files); 916 if (total_mode == total_only) 917 number_width = 1; /* No extra padding, since no alignment requirement. */ 918 else 919 number_width = compute_number_width (nfiles, fstatus); 920 921 ok = true; 922 for (int i = 0; /* */; i++) 923 { 924 bool skip_file = false; 925 enum argv_iter_err ai_err; 926 char *file_name = argv_iter (ai, &ai_err); 927 if (!file_name) 928 { 929 switch (ai_err) 930 { 931 case AI_ERR_EOF: 932 goto argv_iter_done; 933 case AI_ERR_READ: 934 error (0, errno, _("%s: read error"), 935 quotef (files_from)); 936 ok = false; 937 goto argv_iter_done; 938 case AI_ERR_MEM: 939 xalloc_die (); 940 default: 941 affirm (!"unexpected error code from argv_iter"); 942 } 943 } 944 if (files_from && STREQ (files_from, "-") && STREQ (file_name, "-")) 945 { 946 /* Give a better diagnostic in an unusual case: 947 printf - | wc --files0-from=- */ 948 error (0, 0, _("when reading file names from stdin, " 949 "no file name of %s allowed"), 950 quoteaf (file_name)); 951 skip_file = true; 952 } 953 954 if (!file_name[0]) 955 { 956 /* Diagnose a zero-length file name. When it's one 957 among many, knowing the record number may help. 958 FIXME: currently print the record number only with 959 --files0-from=FILE. Maybe do it for argv, too? */ 960 if (files_from == nullptr) 961 error (0, 0, "%s", _("invalid zero-length file name")); 962 else 963 { 964 /* Using the standard 'filename:line-number:' prefix here is 965 not totally appropriate, since NUL is the separator, not NL, 966 but it might be better than nothing. */ 967 unsigned long int file_number = argv_iter_n_args (ai); 968 error (0, 0, "%s:%lu: %s", quotef (files_from), 969 file_number, _("invalid zero-length file name")); 970 } 971 skip_file = true; 972 } 973 974 if (skip_file) 975 ok = false; 976 else 977 ok &= wc_file (file_name, &fstatus[nfiles ? i : 0]); 978 979 if (! nfiles) 980 fstatus[0].failed = 1; 981 } 982 argv_iter_done: 983 984 /* No arguments on the command line is fine. That means read from stdin. 985 However, no arguments on the --files0-from input stream is an error 986 means don't read anything. */ 987 if (ok && !files_from && argv_iter_n_args (ai) == 0) 988 ok &= wc_file (nullptr, &fstatus[0]); 989 990 if (read_tokens) 991 readtokens0_free (&tok); 992 993 if (total_mode != total_never 994 && (total_mode != total_auto || 1 < argv_iter_n_args (ai))) 995 { 996 if (total_lines_overflow) 997 { 998 total_lines = UINTMAX_MAX; 999 error (0, EOVERFLOW, _("total lines")); 1000 ok = false; 1001 } 1002 if (total_words_overflow) 1003 { 1004 total_words = UINTMAX_MAX; 1005 error (0, EOVERFLOW, _("total words")); 1006 ok = false; 1007 } 1008 if (total_chars_overflow) 1009 { 1010 total_chars = UINTMAX_MAX; 1011 error (0, EOVERFLOW, _("total characters")); 1012 ok = false; 1013 } 1014 if (total_bytes_overflow) 1015 { 1016 total_bytes = UINTMAX_MAX; 1017 error (0, EOVERFLOW, _("total bytes")); 1018 ok = false; 1019 } 1020 1021 write_counts (total_lines, total_words, total_chars, total_bytes, 1022 max_line_length, 1023 total_mode != total_only ? _("total") : nullptr); 1024 } 1025 1026 argv_iter_free (ai); 1027 1028 free (fstatus); 1029 1030 if (have_read_stdin && close (STDIN_FILENO) != 0) 1031 error (EXIT_FAILURE, errno, "-"); 1032 1033 return ok ? EXIT_SUCCESS : EXIT_FAILURE; 1034 }