werc-1.5.0-tweaks

Tweaks for the werc website builder created by the mad architect Uriel
Log | Files | Refs | README

md2html.awk (8731B)


      1 #!/bin/awk -f
      2 #
      3 # by: Jesus Galan (yiyus) 2009
      4 #
      5 # Usage: md2html.awk file.md > file.html
      6 # See: http://4l77.com/src/md2html.awk
      7 
      8 function eschtml(t) {
      9 	gsub("&", "\\&", t);
     10 	gsub("<", "\\&lt;", t);
     11 	return t;
     12 }
     13 
     14 function oprint(t){
     15 	if(nr == 0)
     16 		print t;
     17 	else
     18 		otext = otext "\n" t;
     19 }
     20 
     21 function subref(id){
     22 	for(; nr > 0 && sub("<<" id, ref[id], otext); nr--);
     23 	if(nr == 0 && otext) {
     24 		print otext;
     25 		otext = "";
     26 	}
     27 }
     28 
     29 function nextil(t) {
     30 	if(!match(t, /[`<&\[*_\\-]|(\!\[)/))
     31 		return t;
     32 	t1 = substr(t, 1, RSTART - 1);
     33 	tag = substr(t, RSTART, RLENGTH);
     34 	t2 = substr(t, RSTART + RLENGTH);
     35 	if(ilcode && tag != "`")
     36 		return eschtml(t1 tag) nextil(t2);
     37 	# Backslash escaping
     38 	if(tag == "\\"){
     39 		if(match(t2, /^[\\`*_{}\[\]()#+\-\.!]/)){
     40 			tag = substr(t2, 1, 1);
     41 			t2 = substr(t2, 2);
     42 		}
     43 		return t1 tag nextil(t2);
     44 	}
     45 	# Dashes
     46 	if(tag == "-"){
     47 		if(sub(/^-/, "", t2))
     48 			tag = "&#8212;";
     49 		return t1 tag nextil(t2);
     50 	}
     51 	# Inline Code
     52 	if(tag == "`"){
     53 		if(sub(/^`/, "", t2)){
     54 			if(!match(t2, /``/))
     55 				return t1 "&#8221;" nextil(t2);
     56 			ilcode2 = !ilcode2;
     57 		}
     58 		else if(ilcode2)
     59 			return t1 tag nextil(t2);
     60 		tag = "<code>";
     61 		if(ilcode){
     62 			t1 = eschtml(t1);
     63 			tag = "</code>";
     64 		}
     65 		ilcode = !ilcode;
     66 		return t1 tag nextil(t2);
     67 	}
     68 	if(tag == "<"){
     69 	# Autolinks
     70 		if(match(t2, /^[^ 	]+[\.@][^ 	]+>/)){
     71 			url = eschtml(substr(t2, 1, RLENGTH - 1));
     72 			t2 = substr(t2, RLENGTH + 1);
     73 			linktext = url;
     74 			if(match(url, /@/) && !match(url, /^mailto:/))
     75 				url = "mailto:" url;
     76 			return t1 "<a href=\"" url "\">" linktext "</a>" nextil(t2);
     77 		}
     78 	# Html tags
     79 		if(match(t2, /^[A-Za-z\/!][^>]*>/)){
     80 			tag = tag substr(t2, RSTART, RLENGTH);
     81 			t2 = substr(t2, RLENGTH + 1);
     82 			return t1 tag nextil(t2);
     83 		}
     84 		return t1 "&lt;" nextil(t2);
     85 	}
     86 	# Html special entities
     87 	if(tag == "&"){
     88 		if(match(t2, /^#?[A-Za-z0-9]+;/)){
     89 			tag = tag substr(t2, RSTART, RLENGTH);
     90 			t2 = substr(t2, RLENGTH + 1);
     91 			return t1 tag nextil(t2);
     92 		}
     93 		return t1 "&amp;" nextil(t2);
     94 	}
     95 	# Images
     96 	if(tag == "!["){
     97 		if(!match(t2, /(\[.*\])|(\(.*\))/))
     98 			return t1 tag nextil(t2);
     99 		match(t2, /^[^\]]*/);
    100 		alt = substr(t2, 1, RLENGTH);
    101 		t2 = substr(t2, RLENGTH + 2);
    102 		if(match(t2, /^\(/)){
    103 			# Inline
    104 			sub(/^\(/, "", t2);
    105 			match(t2, /^[^\)]+/);
    106 			url = eschtml(substr(t2, 1, RLENGTH));
    107 			t2 = substr(t2, RLENGTH + 2);
    108 			title = "";
    109 			if(match(url, /[ 	]+\".*\"[ 	]*$/)) {
    110 				title = substr(url, RSTART, RLENGTH);
    111 				url = substr(url, 1, RSTART - 1);
    112 				match(title, /\".*\"/);
    113 				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
    114 			}
    115 			if(match(url, /^<.*>$/))
    116 				url = substr(url, 2, RLENGTH - 2);
    117 			return t1 "<img src=\"" url "\" alt=\"" alt "\"" title " />" nextil(t2);
    118 		}
    119 		else{
    120 			# Referenced
    121 			sub(/^ ?\[/, "", t2);
    122 			id = alt;
    123 			if(match(t2, /^[^\]]+/))
    124 				id = substr(t2, 1, RLENGTH);
    125 			t2 = substr(t2, RLENGTH + 2);
    126 			if(ref[id])
    127 				r = ref[id];
    128 			else{
    129 				r = "<<" id;
    130 				nr++;
    131 			}
    132 			return t1 "<img src=\"" r "\" alt=\"" alt "\" />" nextil(t2);
    133 		}
    134 	}
    135 	# Links
    136 	if(tag == "["){
    137 		if(!match(t2, /(\[.*\])|(\(.*\))/))
    138 			return t1 tag nextil(t2);
    139 		match(t2, /^[^\]]*(\[[^\]]*\][^\]]*)*/);
    140 		linktext = substr(t2, 1, RLENGTH);
    141 		t2 = substr(t2, RLENGTH + 2);
    142 		if(match(t2, /^\(/)){
    143 			# Inline
    144 			match(t2, /^[^\)]+(\([^\)]+\)[^\)]*)*/);
    145 			url = substr(t2, 2, RLENGTH - 1);
    146 			pt2 = substr(t2, RLENGTH + 2);
    147 			title = "";
    148 			if(match(url, /[ 	]+\".*\"[ 	]*$/)) {
    149 				title = substr(url, RSTART, RLENGTH);
    150 				url = substr(url, 1, RSTART - 1);
    151 				match(title, /\".*\"/);
    152 				title = " title=\"" substr(title, RSTART + 1, RLENGTH - 2) "\"";
    153 			}
    154 			if(match(url, /^<.*>$/))
    155 				url = substr(url, 2, RLENGTH - 2);
    156 			url = eschtml(url);
    157 			return t1 "<a href=\"" url "\"" title ">" nextil(linktext) "</a>" nextil(pt2);
    158 		}
    159 		else{
    160 			# Referenced
    161 			sub(/^ ?\[/, "", t2);
    162 			id = linktext;
    163 			if(match(t2, /^[^\]]+/))
    164 				id = substr(t2, 1, RLENGTH);
    165 			t2 = substr(t2, RLENGTH + 2);
    166 			if(ref[id])
    167 				r = ref[id];
    168 			else{
    169 				r = "<<" id;
    170 				nr++;
    171 			}
    172 			pt2 = t2;
    173 			return t1 "<a href=\"" r "\" />" nextil(linktext) "</a>" nextil(pt2);
    174 		}
    175 	}
    176 	# Emphasis
    177 	if(match(tag, /[*_]/)){
    178 		ntag = tag;
    179 		if(sub("^" tag, "", t2)){
    180 			if(stag[ns] == tag && match(t2, "^" tag))
    181 				t2 = tag t2;
    182 			else
    183 				ntag = tag tag
    184 		}
    185 		n = length(ntag);
    186 		tag = (n == 2) ? "strong" : "em";
    187 		if(match(t1, / $/) && match(t2, /^ /))
    188 			return t1 tag nextil(t2);
    189 		if(stag[ns] == ntag){
    190 			tag = "/" tag;
    191 			ns--;
    192 		}
    193 		else
    194 			stag[++ns] = ntag;
    195 		tag = "<" tag ">";
    196 		return t1 tag nextil(t2);
    197 	}
    198 }
    199 
    200 function inline(t) {
    201 	ilcode = 0;
    202 	ilcode2 = 0;
    203 	ns = 0;
    204 	
    205 	return nextil(t);
    206 }
    207 
    208 function printp(tag) {
    209 	if(!match(text, /^[ 	]*$/)){
    210 		text = inline(text);
    211 		if(tag != "")
    212 			oprint("<" tag ">" text "</" tag ">");
    213 		else
    214 			oprint(text);
    215 	}
    216 	text = "";
    217 }
    218 
    219 BEGIN {
    220 	blank = 0;
    221 	code = 0;
    222 	hr = 0;
    223 	html = 0;
    224 	nl = 0;
    225 	nr = 0;
    226 	otext = "";
    227 	text = "";
    228 	par = "p";
    229 }
    230 
    231 # References
    232 !code && /^ *\[[^\]]*\]:[ 	]+/ {
    233 	sub(/^ *\[/, "");
    234 	match($0, /\]/);
    235 	id = substr($0, 1, RSTART - 1);
    236 	sub(id "\\]:[ 	]+", "");
    237 	title = "";
    238 	if(match($0, /\".*\"$/))
    239 		title = "\" title=\"" substr($0, RSTART + 1, RLENGTH - 2);
    240 	sub(/[ 	]+\".*\"$/, "");
    241 	url = eschtml($0);
    242 	ref[id] = url title;
    243 
    244 	subref(id);
    245 	next;
    246 }
    247 
    248 # html
    249 !html && /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
    250 isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/ {
    251 	if(code)
    252 		oprint("</pre></code>");
    253 	for(; !text && block[nl] == "blockquote"; nl--)
    254 		oprint("</blockquote>");
    255 	match($0, /^<(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
    256 	isindex|menu|noframes|noscript|ol|p|pre|table|ul|!--)/);
    257 	htag = substr($0, 2, RLENGTH - 1);
    258 	if(!match($0, "(<\\/" htag ">)|((^<hr ?\\/?)|(--)>$)"))
    259 		html = 1;
    260 	if(html && match($0, /^<hr/))
    261 		hr = 1;
    262 	oprint($0);
    263 	next;
    264 }
    265 
    266 html && (/(^<\/(address|blockquote|center|dir|div|dl|fieldset|form|h[1-6r]|\
    267 isindex|menu|noframes|noscript|ol|p|pre|table|ul).*)|(--)>$/ ||
    268 (hr && />$/)) {
    269 	html = 0;
    270 	hr = 0;
    271 	oprint($0);
    272 	next;
    273 }
    274 
    275 html {
    276 	oprint($0);
    277 	next;
    278 }
    279 
# List and quote blocks
#
# State used by the rules below (all rules run, in order, for every line
# that was not consumed by an earlier `next`):
#   block[1..nl]  - types of the currently open blocks ("ul", "ol",
#                   "blockquote"), outermost first
#   nnl           - nesting level computed for the current line
#   nblock[1..]   - block types requested by the current line
#   blank         - a blank line was seen and not yet acted upon
#   newli         - the current line starts a new list item
#   par           - tag used to wrap the pending paragraph ("" = no wrapper)

#   Remove indentation
# For each open block, strip one level of prefix from the line: four spaces
# or a tab for a list, ">" plus an optional space for a blockquote.  Stop at
# the first level whose prefix is missing; nnl is the number of levels found.
{
	for(nnl = 0; nnl < nl; nnl++)
		if((match(block[nnl + 1], /[ou]l/) && !sub(/^(    |	)/, "")) || \
		(block[nnl + 1] == "blockquote" && !sub(/^> ?/, "")))
			break;
}
# Continuation line: fewer levels matched, but paragraph text is pending, no
# blank line intervened and the line is not a list item, so it stays in the
# deepest open block.
nnl < nl && !blank && text && ! /^ ? ? ?([*+-]|([0-9]+\.)+)( +|	)/ { nnl = nl; }
#   Quote blocks
# Every remaining leading "> " requests one more blockquote level.
{ 
	while(sub(/^> /, ""))
		nblock[++nnl] = "blockquote";
}
#   Horizontal rules
# hr is recomputed for every line.
{ hr = 0; }
# Three or more "-", "*" or "_" (optionally separated by blanks, indented by
# at most three spaces), after a blank line or when neither paragraph text
# nor a code block is pending.  Setting nnl to 0 makes the rules further
# down close every open block.  The code branch can only be reached through
# the `blank` alternative of the pattern.
(blank || (!text && !code)) && /^ ? ? ?([-*_][ 	]*)([-*_][ 	]*)([-*_][ 	]*)+$/ {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	blank = 0;
	nnl = 0;
	hr = 1;
}
#   List items
# A blank line while the innermost open block is a list is only remembered.
block[nl] ~ /[ou]l/ && /^$/ {
	blank = 1;
	next;
}
{ newli = 0; }
# Unordered item: "*", "+" or "-" followed by spaces or a tab.  The marker is
# removed and one more "ul" level is requested.  Skipped for lines already
# recognized as a horizontal rule.
!hr && (nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?[*+-]( +|	)/ {
	sub(/^ ? ? ?[*+-]( +|	)/, "");
	nnl++;
	nblock[nnl] = "ul";
	newli = 1;
}
# Ordered item: one or more "<digits>." groups followed by spaces or a tab.
(nnl != nl || !text || block[nl] ~ /[ou]l/) && /^ ? ? ?([0-9]+\.)+( +|	)/ {
	sub(/^ ? ? ?([0-9]+\.)+( +|	)/, "");
	nnl++;
	nblock[nnl] = "ol";
	newli = 1;
}
# New list item: flush the text of the previous item.  If a blank line
# preceded an item at the same level, the previous item's text is wrapped in
# <p>.  An item at the same level and of the same list type as the open list
# is separated from its predecessor with "</li><li>".
newli { 
	if(blank && nnl == nl && !par)
		par = "p";
	blank = 0;
	printp(par);
	if(nnl == nl && block[nl] == nblock[nl])
		oprint("</li><li>");
}
# First non-empty line after a blank line: flush what was pending (wrapped
# in <p> when inside a list) and go back to paragraph mode.
blank && ! /^$/ {
	if(match(block[nnl], /[ou]l/) && !par)
		par = "p";
	printp(par);
	par = "p";
	blank = 0;
}
    339 		
# Close old blocks and open new ones
# Compares the nesting requested by the current line (nnl, nblock[]) with
# the blocks that are open (nl, block[]).
#
# The block structure changes: close a pending code block, flush pending
# paragraph text and choose the wrapper for the following text: none inside
# a list, <p> otherwise.
nnl != nl || nblock[nl] != block[nl] {
	if(code){
		oprint("</pre></code>");
		code = 0;
	}
	printp(par);
	b = (nnl > nl) ? nblock[nnl] : block[nnl];
	par = (match(b, /[ou]l/)) ? "" : "p";
}
# Close blocks from the innermost outwards; a list also closes its last item.
# NOTE(review): pblock is never assigned anywhere in the visible source, so
# pblock[nl] is always empty and the second loop condition is true for any
# open block at level nnl: that level is closed here and, if the line
# requested it, reopened by the next rule.  This looks like a typo for
# nblock, but the emitted list structure currently relies on this behavior;
# confirm before changing it.
nnl < nl || (nnl == nl && nblock[nl] != block[nl]) {
	for(; nl > nnl || (nnl == nl && pblock[nl] != block[nl]); nl--){
		if(match(block[nl], /[ou]l/))
			oprint("</li>");
		oprint("</" block[nl] ">");
	}
}
# Open the newly requested blocks; a list also opens its first item.
nnl > nl {
	for(; nl < nnl; nl++){
		block[nl + 1] = nblock[nl + 1];
		oprint("<" block[nl + 1] ">");
		if(match(block[nl + 1], /[ou]l/))
			oprint("<li>");
	}
}
# The line was recognized as a horizontal rule by an earlier rule.
hr {
	oprint("<hr>");
	next;
}
    369 
    370 # Code blocks
    371 code && /^$/ { 
    372 	if(blanK)
    373 		oprint("");
    374 	blank = 1;
    375 	next;
    376 }
    377 !text && sub(/^(	|    )/, "") {
    378 	if(blanK)
    379 		oprint("");
    380 	blank = 0;
    381 	if(!code)
    382 		oprint("<code><pre>");
    383 	code = 1;
    384 	$0 = eschtml($0);
    385 	oprint($0);
    386 	next;
    387 }
    388 code {
    389 	oprint("</pre></code>");
    390 	code = 0;
    391 }
    392 
    393 # Setex-style Headers
    394 text && /^=+$/ {printp("h1"); next;}
    395 text && /^-+$/ {printp("h2"); next;} 
    396 
    397 # Atx-Style headers
    398 /^#+/ && (!newli || par=="p" || /^##/) {
    399 	for(n = 0; n < 6 && sub(/^# */, ""); n++)
    400 		sub(/#$/, "");
    401 	par = "h" n;
    402 }
    403 
    404 # Paragraph	
    405 /^$/ {
    406 	printp(par);
    407 	par = "p";
    408 	next;
    409 }
    410 
    411 # Add text
    412 { text = (text ? text " " : "") $0; }
    413 
    414 END {
    415 	if(code){
    416 		oprint("</pre></code>");
    417 		code = 0;
    418 	}
    419 	printp(par);
    420 	for(; nl > 0; nl--){
    421 		if(match(block[nl], /[ou]l/))
    422 			oprint("</li>");
    423 		oprint("</" block[nl] ">");
    424 	}
    425 	gsub(/<<[^\"]*/, "", otext);
    426 	print(otext);
    427 }