/* busybox-wc.c (6530B) */
1 /* vi: set sw=4 ts=4: */ 2 /* 3 * wc implementation for busybox 4 * 5 * Copyright (C) 2003 Manuel Novoa III <mjn3@codepoet.org> 6 * 7 * Licensed under GPLv2 or later, see file LICENSE in this source tree. 8 */ 9 /* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org) 10 * 11 * Rewritten to fix a number of problems and do some size optimizations. 12 * Problems in the previous busybox implementation (besides bloat) included: 13 * 1) broken 'wc -c' optimization (read note below) 14 * 2) broken handling of '-' args 15 * 3) no checking of ferror on EOF returns 16 * 4) isprint() wasn't considered when word counting. 17 * 18 * NOTES: 19 * 20 * The previous busybox wc attempted an optimization using stat for the 21 * case of counting chars only. I omitted that because it was broken. 22 * It didn't take into account the possibility of input coming from a 23 * pipe, or input from a file with file pointer not at the beginning. 24 * 25 * To implement such a speed optimization correctly, not only do you 26 * need the size, but also the file position. Note also that the 27 * file position may be past the end of file. Consider the example 28 * (adapted from example in gnu wc.c) 29 * 30 * echo hello > /tmp/testfile && 31 * (dd ibs=1k skip=1 count=0 &> /dev/null; wc -c) < /tmp/testfile 32 * 33 * for which 'wc -c' should output '0'. 34 */ 35 //config:config WC 36 //config: bool "wc (4.7 kb)" 37 //config: default y 38 //config: help 39 //config: wc is used to print the number of bytes, words, and lines, 40 //config: in specified files. 41 //config: 42 //config:config FEATURE_WC_LARGE 43 //config: bool "Support very large counts" 44 //config: default y 45 //config: depends on WC 46 //config: help 47 //config: Use "unsigned long long" for counter variables. 48 49 //applet:IF_WC(APPLET(wc, BB_DIR_USR_BIN, BB_SUID_DROP)) 50 51 //kbuild:lib-$(CONFIG_WC) += wc.o 52 53 /* BB_AUDIT SUSv3 compliant. 
*/ 54 /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */ 55 56 #include "libbb.h" 57 #include "unicode.h" 58 59 #if !ENABLE_LOCALE_SUPPORT 60 # undef isprint 61 # undef isspace 62 # define isprint(c) ((unsigned)((c) - 0x20) <= (0x7e - 0x20)) 63 # define isspace(c) ((c) == ' ') 64 #endif 65 66 #if ENABLE_FEATURE_WC_LARGE 67 # define COUNT_T unsigned long long 68 # define COUNT_FMT "llu" 69 #else 70 # define COUNT_T unsigned 71 # define COUNT_FMT "u" 72 #endif 73 74 /* We support -m even when UNICODE_SUPPORT is off, 75 * we just don't advertise it in help text, 76 * since it is the same as -c in this case. 77 */ 78 79 //usage:#define wc_trivial_usage 80 //usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..." 81 //usage: 82 //usage:#define wc_full_usage "\n\n" 83 //usage: "Count lines, words, and bytes for FILEs (or stdin)\n" 84 //usage: "\n -c Count bytes" 85 //usage: IF_UNICODE_SUPPORT( 86 //usage: "\n -m Count characters" 87 //usage: ) 88 //usage: "\n -l Count newlines" 89 //usage: "\n -w Count words" 90 //usage: "\n -L Print longest line length" 91 //usage: 92 //usage:#define wc_example_usage 93 //usage: "$ wc /etc/passwd\n" 94 //usage: " 31 46 1365 /etc/passwd\n" 95 96 /* Order is important if we want to be compatible with 97 * column order in "wc -cmlwL" output: 98 */ 99 enum { 100 WC_LINES = 0, /* -l */ 101 WC_WORDS = 1, /* -w */ 102 WC_UNICHARS = 2, /* -m */ 103 WC_BYTES = 3, /* -c */ 104 WC_LENGTH = 4, /* -L */ 105 NUM_WCS = 5, 106 }; 107 108 int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; 109 int wc_main(int argc UNUSED_PARAM, char **argv) 110 { 111 const char *arg; 112 const char *start_fmt = " %9"COUNT_FMT + 1; 113 const char *fname_fmt = " %s\n"; 114 COUNT_T *pcounts; 115 COUNT_T counts[NUM_WCS]; 116 COUNT_T totals[NUM_WCS]; 117 int num_files; 118 smallint status = EXIT_SUCCESS; 119 unsigned print_type; 120 121 init_unicode(); 122 123 print_type = getopt32(argv, "lwmcL"); 124 125 if (print_type == 0) { 126 print_type = (1 << 
WC_LINES) | (1 << WC_WORDS) | (1 << WC_BYTES); 127 } 128 129 argv += optind; 130 if (!argv[0]) { 131 *--argv = (char *) bb_msg_standard_input; 132 fname_fmt = "\n"; 133 } 134 if (!argv[1]) { /* zero or one filename? */ 135 if (!((print_type-1) & print_type)) /* exactly one option? */ 136 start_fmt = "%"COUNT_FMT; 137 } 138 139 memset(totals, 0, sizeof(totals)); 140 141 pcounts = counts; 142 143 num_files = 0; 144 while ((arg = *argv++) != NULL) { 145 FILE *fp; 146 const char *s; 147 unsigned u; 148 unsigned linepos; 149 smallint in_word; 150 151 ++num_files; 152 fp = fopen_or_warn_stdin(arg); 153 if (!fp) { 154 status = EXIT_FAILURE; 155 continue; 156 } 157 158 memset(counts, 0, sizeof(counts)); 159 linepos = 0; 160 in_word = 0; 161 162 while (1) { 163 int c; 164 /* Our -w doesn't match GNU wc exactly... oh well */ 165 166 c = getc(fp); 167 if (c == EOF) { 168 if (ferror(fp)) { 169 bb_simple_perror_msg(arg); 170 status = EXIT_FAILURE; 171 } 172 goto DO_EOF; /* Treat an EOF as '\r'. */ 173 } 174 175 /* Cater for -c and -m */ 176 ++counts[WC_BYTES]; 177 if (unicode_status != UNICODE_ON /* every byte is a new char */ 178 || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */ 179 ) { 180 ++counts[WC_UNICHARS]; 181 } 182 183 if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */ 184 ++linepos; 185 if (!isspace(c)) { 186 in_word = 1; 187 continue; 188 } 189 } else if ((unsigned)(c - 9) <= 4) { 190 /* \t 9 191 * \n 10 192 * \v 11 193 * \f 12 194 * \r 13 195 */ 196 if (c == '\t') { 197 linepos = (linepos | 7) + 1; 198 } else { /* '\n', '\r', '\f', or '\v' */ 199 DO_EOF: 200 if (linepos > counts[WC_LENGTH]) { 201 counts[WC_LENGTH] = linepos; 202 } 203 if (c == '\n') { 204 ++counts[WC_LINES]; 205 } 206 if (c != '\v') { 207 linepos = 0; 208 } 209 } 210 } else { 211 continue; 212 } 213 214 counts[WC_WORDS] += in_word; 215 in_word = 0; 216 if (c == EOF) { 217 break; 218 } 219 } 220 221 fclose_if_not_stdin(fp); 222 223 if (totals[WC_LENGTH] < counts[WC_LENGTH]) { 
224 totals[WC_LENGTH] = counts[WC_LENGTH]; 225 } 226 totals[WC_LENGTH] -= counts[WC_LENGTH]; 227 228 OUTPUT: 229 /* coreutils wc tries hard to print pretty columns 230 * (saves results for all files, finds max col len etc...) 231 * we won't try that hard, it will bloat us too much */ 232 s = start_fmt; 233 u = 0; 234 do { 235 if (print_type & (1 << u)) { 236 printf(s, pcounts[u]); 237 s = " %9"COUNT_FMT; /* Ok... restore the leading space. */ 238 } 239 totals[u] += pcounts[u]; 240 } while (++u < NUM_WCS); 241 printf(fname_fmt, arg); 242 } 243 244 /* If more than one file was processed, we want the totals. To save some 245 * space, we set the pcounts ptr to the totals array. This has the side 246 * effect of trashing the totals array after outputting it, but that's 247 * irrelavent since we no longer need it. */ 248 if (num_files > 1) { 249 num_files = 0; /* Make sure we don't get here again. */ 250 arg = "total"; 251 pcounts = totals; 252 --argv; 253 goto OUTPUT; 254 } 255 256 fflush_stdout_and_exit(status); 257 }