werc-1.5.0-tweaks

Tweaks for the werc website builder created by the mad architect Uriel
Log | Files | Refs | README

urlencode.awk (3733B)


      1 # Taken from http://www.shelldorado.com/scripts/cmds/urlencode
      2 ##########################################################################
      3 # Title      :  urlencode - encode URL data
      4 # Author     :  Heiner Steven (heiner.steven@odn.de)
      5 # Date       :  2000-03-15
      6 # Requires   :  awk
      7 # Categories :  File Conversion, WWW, CGI
      8 # SCCS-Id.   :  @(#) urlencode  1.4 06/10/29
      9 ##########################################################################
     10 # Description
     11 #   Encode data according to
     12 #       RFC 1738: "Uniform Resource Locators (URL)" and
     13 #       RFC 1866: "Hypertext Markup Language - 2.0" (HTML)
     14 #
     15 #   This encoding is used e.g. for the MIME type
     16 #   "application/x-www-form-urlencoded"
     17 #
     18 # Notes
     19 #    o  The default behaviour is not to encode the line endings. This
     20 #   may not be what was intended, because the result will be
     21 #   multiple lines of output (which cannot be used in a URL or an
     22 #   HTTP "POST" request). If the desired output should be one
     23 #   line, use the "-l" option.
     24 #
     25 #    o  The "-l" option assumes that the end-of-line is denoted by
     26 #   the character LF (ASCII 10). This is not true for Windows or
     27 #   Mac systems, where the end of a line is denoted by the two
     28 #   characters CR LF (ASCII 13 10).
     29 #   We use this for symmetry; data processed in the following way:
     30 #       cat | urlencode -l | urldecode -l
     31 #   should (and will) result in the original data
     32 #
     33 #    o  Large lines (or binary files) will break many AWK
     34 #       implementations. If you get the message
     35 #       awk: record `...' too long
     36 #        record number xxx
     37 #   consider using GNU AWK (gawk).
     38 #
     39 #    o  urlencode will always terminate its output with an EOL
     40 #       character
     41 #
     42 # Thanks to Stefan Brozinski for pointing out a bug related to non-standard
     43 # locales.
     44 #
     45 # See also
     46 #   urldecode
     47 ##########################################################################
     48 
     49 PN=`basename "$0"`          # Program name
     50 VER='1.4'
     51 
     52 : ${AWK=awk}
     53 
     54 Usage () {
     55     echo >&2 "$PN - encode URL data, $VER
     56 usage: $PN [-l] [file ...]
     57     -l:  encode line endings (result will be one line of output)
     58 
     59 The default is to encode each input line on its own."
     60     exit 1
     61 }
     62 
     63 Msg () {
     64     for MsgLine
     65     do echo "$PN: $MsgLine" >&2
     66     done
     67 }
     68 
     69 Fatal () { Msg "$@"; exit 1; }
     70 
     71 set -- `getopt hl "$@" 2>/dev/null` || Usage
     72 [ $# -lt 1 ] && Usage           # "getopt" detected an error
     73 
     74 EncodeEOL=no
     75 while [ $# -gt 0 ]
     76 do
     77     case "$1" in
     78         -l) EncodeEOL=yes;;
     79     --) shift; break;;
     80     -h) Usage;;
     81     -*) Usage;;
     82     *)  break;;         # First file name
     83     esac
     84     shift
     85 done
     86 
     87 LANG=C  export LANG
     88 $AWK '
     89     BEGIN {
     90     # We assume an awk implementation that is just plain dumb.
     91     # We will convert an character to its ASCII value with the
     92     # table ord[], and produce two-digit hexadecimal output
     93     # without the printf("%02X") feature.
     94 
     95     EOL = "%0A"     # "end of line" string (encoded)
     96     split ("1 2 3 4 5 6 7 8 9 A B C D E F", hextab, " ")
     97     hextab [0] = 0
     98     for ( i=1; i<=255; ++i ) ord [ sprintf ("%c", i) "" ] = i + 0
     99     if ("'"$EncodeEOL"'" == "yes") EncodeEOL = 1; else EncodeEOL = 0
    100     }
    101     {
    102     encoded = ""
    103     for ( i=1; i<=length ($0); ++i ) {
    104         c = substr ($0, i, 1)
    105         if ( c ~ /[a-zA-Z0-9.-]/ ) {
    106         encoded = encoded c     # safe character
    107         } else if ( c == " " ) {
    108         encoded = encoded "+"   # special handling
    109         } else {
    110         # unsafe character, encode it as a two-digit hex-number
    111         lo = ord [c] % 16
    112         hi = int (ord [c] / 16);
    113         encoded = encoded "%" hextab [hi] hextab [lo]
    114         }
    115     }
    116     if ( EncodeEOL ) {
    117         printf ("%s", encoded EOL)
    118     } else {
    119         print encoded
    120     }
    121     }
    122     END {
    123         #if ( EncodeEOL ) print ""
    124     }
    125 ' "$@"
    126