#!/bin/awk -f
#
# md2html.awk
# by: Jesus Galan (yiyus) 2009
#
# Usage: md2html.awk file.md > file.html
# See: http://4l77.com/src/md2html.awk
#
# Global state used across rules and functions:
#   text          pending paragraph text, flushed by printp()
#   par           tag printp() wraps the pending text in ("" means no tag)
#   otext / nr    buffered output / number of unresolved "<<id" reference
#                 markers; output is buffered while nr > 0
#   ref[id]       resolved reference definitions (url plus optional title)
#   block[1..nl]  currently open ul/ol/blockquote blocks
#   nblock, nnl   blocks/nesting level computed for the current line
#   code, html, hr, blank, newli   per-line state flags
#   cblank        blank lines seen inside a code block, not yet printed

# Escape "&" and "<" so that t can be emitted as HTML text.
# "\\&" yields a literal ampersand in a gsub() replacement.
function eschtml(t) {
	gsub("&", "\\&amp;", t);
	gsub("<", "\\&lt;", t);
	return t;
}

# Emit one output line: print it directly when no reference markers are
# pending (nr == 0), otherwise append it to the otext buffer.
function oprint(t){
	if(nr == 0)
		print t;
	else
		otext = otext "\n" t;
}

# Replace pending "<<id" markers in otext by ref[id], decrementing nr once
# per replacement; print and clear the buffer when nothing is pending.
# The replacement is done with index()/substr() rather than sub() so that
# neither regex metacharacters in id nor "&" in ref[id] (e.g. "&amp;" in
# an escaped URL) are interpreted.
function subref(id,    marker, pos){
	marker = "<<" id;
	while(nr > 0 && (pos = index(otext, marker)) > 0){
		otext = substr(otext, 1, pos - 1) ref[id] \
			substr(otext, pos + length(marker));
		nr--;
	}
	if(nr == 0 && otext) {
		print otext;
		otext = "";
	}
}

# Convert the inline Markdown in t to HTML.
# Finds the next special token, handles it, and recurses on the remainder.
# ilcode, ilcode2, ns and stag[] are shared state initialised by inline();
# nr and ref[] are the global reference bookkeeping.  The names after t in
# the parameter list are locals: the function is recursive (also on link
# text), so they must not be shared between invocations.
function nextil(t,    t1, tag, t2, url, linktext, title, alt, id, r, pt2, ntag, n) {
	if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
		return t;
	t1 = substr(t, 1, RSTART - 1);
	tag = substr(t, RSTART, RLENGTH);
	t2 = substr(t, RSTART + RLENGTH);
	# Inside inline code everything but a backtick is literal text
	if(ilcode && tag != "`")
		return eschtml(t1 tag) nextil(t2);
	# Backslash escaping
	if(tag == "\\"){
		if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
			tag = substr(t2, 1, 1);
			t2 = substr(t2, 2);
		}
		return t1 tag nextil(t2);
	}
	# Dashes: "--" becomes an em dash
	if(tag == "-"){
		if(sub(/^-/, "", t2))
			tag = "&mdash;";
		return t1 tag nextil(t2);
	}
	# Inline Code
	if(tag == "`"){
		if(sub(/^`/, "", t2)){
			# A double backtick with no matching pair later on is
			# emitted as a closing double quote
			if(!match(t2, /``/))
				return t1 "&rdquo;" nextil(t2);
			ilcode2 = !ilcode2;
		}
		else if(ilcode2)
			return t1 tag nextil(t2);
		tag = "<code>";
		if(ilcode){
			t1 = eschtml(t1);
			tag = "</code>";
		}
		ilcode = !ilcode;
		return t1 tag nextil(t2);
	}
	if(tag == "<"){
		# Autolinks
		if(match(t2, /^[^ ]+[\.@][^ ]+>/)){
			url = eschtml(substr(t2, 1, RLENGTH - 1));
			t2 = substr(t2, RLENGTH + 1);
			linktext = url;
			if(match(url, /@/) && !match(url, /^mailto:/))
				url = "mailto:" url;
			return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
		}
		# Html tags are passed through untouched
		if(match(t2, /^[A-Za-z\/!][^>]*>/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		# Anything else is a literal "<" and must be escaped
		return t1 "&lt;" nextil(t2);
	}
	# Html special entities
	if(tag == "&"){
		if(match(t2, /^#?[A-Za-z0-9]+;/)){
			tag = tag substr(t2, RSTART, RLENGTH);
			t2 = substr(t2, RLENGTH + 1);
			return t1 tag nextil(t2);
		}
		# Not an entity: escape the bare ampersand
		return t1 "&amp;" nextil(t2);
	}
	# Images
	if(tag == "!["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*/);
		alt = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			sub(/^\(/, "", t2);
			match(t2, /^[^\)]+/);
			url = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ \t]+\".*\"[ \t]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" eschtml(substr(title, RSTART + 1, RLENGTH - 2)) "\"";
			}
			# Strip an optional <...> wrapper before escaping; once
			# escaped the URL no longer starts with "<"
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = alt;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a marker for subref()
				r = "<<" id;
				nr++;
			}
			return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
		}
	}
	# Links
	if(tag == "["){
		if(!match(t2, /(\[.*\])|(\(.*\))/))
			return t1 tag nextil(t2);
		match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
		linktext = substr(t2, 1, RLENGTH);
		t2 = substr(t2, RLENGTH + 2);
		if(match(t2, /^\(/)){
			# Inline
			match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
			url = substr(t2, 2, RLENGTH - 1);
			pt2 = substr(t2, RLENGTH + 2);
			title = "";
			if(match(url, /[ \t]+\".*\"[ \t]*$/)) {
				title = substr(url, RSTART, RLENGTH);
				url = substr(url, 1, RSTART - 1);
				match(title, /\".*\"/);
				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
			}
			if(match(url, /^<.*>$/))
				url = substr(url, 2, RLENGTH - 2);
			url = eschtml(url);
			return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
		}
		else{
			# Referenced
			sub(/^ ?\[/, "", t2);
			id = linktext;
			if(match(t2, /^[^\]]+/))
				id = substr(t2, 1, RLENGTH);
			t2 = substr(t2, RLENGTH + 2);
			if(ref[id])
				r = ref[id];
			else{
				# Not defined yet: leave a marker for subref()
				r = "<<" id;
				nr++;
			}
			pt2 = t2;
			return t1 "<a href=\"" r "\">" nextil(linktext) "</a>" nextil(pt2);
		}
	}
	# Emphasis
	if(match(tag, /[*_]/)){
		ntag = tag;
		# "[" tag "]" matches the marker literally; a bare "^*" is not
		# a portable regular expression
		if(sub("^[" tag "]", "", t2)){
			if(stag[ns] == tag && match(t2, "^[" tag "]"))
				t2 = tag t2;
			else
				ntag = tag tag;
		}
		n = length(ntag);
		tag = (n == 2) ? "strong" : "em";
		# A marker surrounded by spaces is literal text
		if(match(t1, / $/) && match(t2, /^ /))
			return t1 ntag nextil(t2);
		if(stag[ns] == ntag){
			tag = "/" tag;
			ns--;
		}
		else
			stag[++ns] = ntag;
		tag = "<" tag ">";
		return t1 tag nextil(t2);
	}
}

# Convert a whole span of inline Markdown: reset the inline-code and
# emphasis state shared by the nextil() recursion, then run it on t.
function inline(t) {
	ilcode = 0;
	ilcode2 = 0;
	ns = 0;

	return nextil(t);
}

# Flush the pending text: unless it is blank, convert it with inline() and
# emit it wrapped in <tag>...</tag> (or bare when tag is ""); then clear it.
function printp(tag) {
	if(!match(text, /^[ \t]*$/)){
		text = inline(text);
		if(tag != "")
			oprint("<" tag ">" text "</" tag ">");
		else
			oprint(text);
	}
	text = "";
}

BEGIN {
	blank = 0;
	cblank = 0;
	code = 0;
	hr = 0;
	html = 0;
	nl = 0;
	nr = 0;
	otext = "";
	text = "";
	par = "p";
}

# References: "[id]: url "optional title""
!code && /^ *\[[^\]]*\]:[ \t]+/ {
	sub(/^ *\[/, "");
	match($0, /\]/);
	id = substr($0, 1, RSTART - 1);
	# Drop "id]" by position so that id is never interpreted as a regex
	$0 = substr($0, RSTART + 1);
	sub(/^:[ \t]+/, "");
	title = "";
	if(match($0, /\".*\"$/))
		title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
	sub(/[ \t]+\".*\"$/, "");
	url = eschtml($0);
	# The stored value is spliced between href=" and the closing quote,
	# which is why the title part carries its own quotes
	ref[id] = url title;

	subref(id);
	next;
}

# html: a line starting with a block-level tag or a comment is copied as is
!html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
	if(code){
		oprint("</pre></code>");
		# Reset the flag so the code block is not closed a second time
		code = 0;
	}
	for(; !text && block[nl] == "blockquote"; nl--)
		oprint("</blockquote>");
	match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
	htag = substr($0, 2, RLENGTH - 1);
	# Stay in html mode unless the element is closed on this same line
	if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
		html = 1;
	if(html && match($0, /^<hr/))
		hr = 1;
	oprint($0);
	next;
}

# End of a multi-line html block
html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ || (hr && />$/)) {
	html = 0;
	hr = 0;
	oprint($0);
	next;
}

# Inside a multi-line html block
html {
	oprint($0);
	next;
}

# List and quote blocks

# Remove indentation: for each open block strip one level of indentation
# (four spaces or a tab for lists, "> " for quotes); nnl ends up as the
# number of open blocks this line still belongs to
{
	for(nnl = 0; nnl < nl; nnl++)
		if((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |\t)/, "")) ||
		   (block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
			break;
}
# A line that continues the pending paragraph stays at the current level
nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|\t)/ { nnl = nl; }
# Quote blocks
{
	while(sub(/^> /, ""))
		nblock[++nnl] = "blockquote";
}
# Horizontal rules
{ hr = 0; }
(blank || (!text && !code)) && /^ ? ? ?([-*_][ \t]*)([-*_][ \t]*)([-*_][ \t]*)+$/ {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	blank = 0;
	nnl = 0;
	hr = 1;
}
# List items
block[nl] ~ /[ou]l/ && /^$/ {
	blank = 1;
	next;
}
{ newli = 0; }
!hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|\t)/ {
	sub(/^ ? ? ?[*+-]( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ul";
	newli = 1;
}
(nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|\t)/ {
	sub(/^ ? ? ?([0-9]+\.)+( +|\t)/, "");
	nnl++;
	nblock[nnl] = "ol";
	newli = 1;
}
newli {
	if(blank && nnl == nl && !par)
		par = "p";
	blank = 0;
	printp(par);
	if(nnl == nl && block[nl] == nblock[nl])
		oprint("</li><li>");
}
blank && ! /^$/ {
	if(match(block[nnl], /[ou]l/) && !par)
		par = "p";
	printp(par);
	par = "p";
	blank = 0;
}

# Close old blocks and open new ones
nnl != nl || nblock[nl] != block[nl] {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	printp(par);
	b = (nnl > nl) ? nblock[nnl] : block[nnl];
	par = (match(b, /[ou]l/)) ? "" : "p";
}
nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
	# NOTE(review): pblock is never assigned anywhere in this file, so
	# pblock[nl] is always empty and the block at level nnl is closed as
	# well; the next rule then reopens it.  Left unchanged because the
	# reopening is what emits the <li> for an item that returns to an
	# outer list level -- confirm before "fixing" this to nblock.
	for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
}
nnl > nl {
	for(; nl < nnl; nl++){
		block[nl + 1] = nblock[nl + 1];
		oprint("<" block[nl + 1] ">");
		if(match(block[nl + 1], /[ou]l/))
			oprint("<li>");
	}
}
hr {
	oprint("<hr>");
	next;
}

# Code blocks
# Blank lines inside a code block are only counted here; they are printed
# when another code line follows, so trailing blank lines are dropped.
# (The blank flag cannot be used for this: it is reset by an earlier rule
# before the next code line reaches the rule below.)
code && /^$/ {
	cblank++;
	blank = 1;
	next;
}
!text && sub(/^(    |\t)/, "") {
	blank = 0;
	if(!code){
		cblank = 0;
		oprint("<code><pre>");
	}
	for(; cblank > 0; cblank--)
		oprint("");
	code = 1;
	$0 = eschtml($0);
	oprint($0);
	next;
}
code {
	oprint("</pre></code>");
	code = 0;
}

# Setex-style Headers
text && /^=+$/ {printp("h1"); next;}
text && /^-+$/ {printp("h2"); next;}

# Atx-Style headers
/^#+/ && (!newli || par=="p" || /^##/) {
	for(n = 0; n < 6 && sub(/^# */, ""); n++)
		sub(/#$/, "");
	par = "h" n;
}

# Paragraph
/^$/ {
	printp(par);
	par = "p";
	next;
}

# Add text
{ text = (text ? text " " : "") $0; }

END {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	printp(par);
	for(; nl > 0; nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
	# Drop the markers of references that were never defined
	gsub(/<<[^\"]*/, "", otext);
	print(otext);
}