markdown.pl (35623B)
1 #!/usr/bin/env perl 2 # 3 # Markdown -- A text-to-HTML conversion tool for web writers 4 # 5 # Copyright (c) 2004 John Gruber 6 # <http://daringfireball.net/projects/markdown/> 7 # 8 package Markdown; 9 require 5.006_000; 10 use strict; 11 use warnings; 12 13 use Digest::MD5 qw(md5_hex); 14 use vars qw($VERSION); 15 $VERSION = '1.0.1'; 16 # Tue 14 Dec 2004 17 18 ## Disabled; causes problems under Perl 5.6.1: 19 # use utf8; 20 # binmode( STDOUT, ":utf8" ); # c.f.: http://acis.openlib.org/dev/perl-unicode-struggle.html 21 22 23 # 24 # Global default settings: 25 # 26 my $g_empty_element_suffix = " />"; # Change to ">" for HTML output 27 my $g_tab_width = 4; 28 29 30 # 31 # Globals: 32 # 33 34 # Regex to match balanced [brackets]. See Friedl's 35 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331. 36 my $g_nested_brackets; 37 $g_nested_brackets = qr{ 38 (?> # Atomic matching 39 [^\[\]]+ # Anything other than brackets 40 | 41 \[ 42 (??{ $g_nested_brackets }) # Recursive set of nested brackets 43 \] 44 )* 45 }x; 46 47 48 # Table of hash values for escaped characters: 49 my %g_escape_table; 50 foreach my $char (split //, '\\`*_{}[]()>#+-.!') { 51 $g_escape_table{$char} = md5_hex($char); 52 } 53 54 55 # Global hashes, used by various utility routines 56 my %g_urls; 57 my %g_titles; 58 my %g_html_blocks; 59 60 # Used to track when we're inside an ordered or unordered list 61 # (see _ProcessListItems() for details): 62 my $g_list_level = 0; 63 64 65 #### Blosxom plug-in interface ########################################## 66 67 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine 68 # which posts Markdown should process, using a "meta-markup: markdown" 69 # header. If it's set to 0 (the default), Markdown will process all 70 # entries. 71 my $g_blosxom_use_meta = 0; 72 73 sub start { 1; } 74 sub story { 75 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_; 76 77 if ( (! $g_blosxom_use_meta) or 78 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i)) 79 ){ 80 $$body_ref = Markdown($$body_ref); 81 } 82 1; 83 } 84 85 86 #### Movable Type plug-in interface ##################################### 87 eval {require MT}; # Test to see if we're running in MT. 88 unless ($@) { 89 require MT; 90 import MT; 91 require MT::Template::Context; 92 import MT::Template::Context; 93 94 eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0. 95 unless ($@) { 96 require MT::Plugin; 97 import MT::Plugin; 98 my $plugin = new MT::Plugin({ 99 name => "Markdown", 100 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)", 101 doc_link => 'http://daringfireball.net/projects/markdown/' 102 }); 103 MT->add_plugin( $plugin ); 104 } 105 106 MT::Template::Context->add_container_tag(MarkdownOptions => sub { 107 my $ctx = shift; 108 my $args = shift; 109 my $builder = $ctx->stash('builder'); 110 my $tokens = $ctx->stash('tokens'); 111 112 if (defined ($args->{'output'}) ) { 113 $ctx->stash('markdown_output', lc $args->{'output'}); 114 } 115 116 defined (my $str = $builder->build($ctx, $tokens) ) 117 or return $ctx->error($builder->errstr); 118 $str; # return value 119 }); 120 121 MT->add_text_filter('markdown' => { 122 label => 'Markdown', 123 docs => 'http://daringfireball.net/projects/markdown/', 124 on_format => sub { 125 my $text = shift; 126 my $ctx = shift; 127 my $raw = 0; 128 if (defined $ctx) { 129 my $output = $ctx->stash('markdown_output'); 130 if (defined $output && $output =~ m/^html/i) { 131 $g_empty_element_suffix = ">"; 132 $ctx->stash('markdown_output', ''); 133 } 134 elsif (defined $output && $output eq 'raw') { 135 $raw = 1; 136 $ctx->stash('markdown_output', ''); 137 } 138 else { 139 $raw = 0; 140 $g_empty_element_suffix = " />"; 141 } 142 } 143 $text = $raw ? $text : Markdown($text); 144 $text; 145 }, 146 }); 147 148 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter: 149 my $smartypants; 150 151 { 152 no warnings "once"; 153 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'}; 154 } 155 156 if ($smartypants) { 157 MT->add_text_filter('markdown_with_smartypants' => { 158 label => 'Markdown With SmartyPants', 159 docs => 'http://daringfireball.net/projects/markdown/', 160 on_format => sub { 161 my $text = shift; 162 my $ctx = shift; 163 if (defined $ctx) { 164 my $output = $ctx->stash('markdown_output'); 165 if (defined $output && $output eq 'html') { 166 $g_empty_element_suffix = ">"; 167 } 168 else { 169 $g_empty_element_suffix = " />"; 170 } 171 } 172 $text = Markdown($text); 173 $text = $smartypants->($text, '1'); 174 }, 175 }); 176 } 177 } 178 else { 179 #### BBEdit/command-line text filter interface ########################## 180 # Needs to be hidden from MT (and Blosxom when running in static mode). 181 182 # We're only using $blosxom::version once; tell Perl not to warn us: 183 no warnings 'once'; 184 unless ( defined($blosxom::version) ) { 185 use warnings; 186 187 #### Check for command-line switches: ################# 188 my %cli_opts; 189 use Getopt::Long; 190 Getopt::Long::Configure('pass_through'); 191 GetOptions(\%cli_opts, 192 'version', 193 'shortversion', 194 'html4tags', 195 ); 196 if ($cli_opts{'version'}) { # Version info 197 print "\nThis is Markdown, version $VERSION.\n"; 198 print "Copyright 2004 John Gruber\n"; 199 print "http://daringfireball.net/projects/markdown/\n\n"; 200 exit 0; 201 } 202 if ($cli_opts{'shortversion'}) { # Just the version number string. 203 print $VERSION; 204 exit 0; 205 } 206 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML 207 $g_empty_element_suffix = ">"; 208 } 209 210 211 #### Process incoming text: ########################### 212 my $text; 213 { 214 local $/; # Slurp the whole file 215 $text = <>; 216 } 217 print Markdown($text); 218 } 219 } 220 221 222 223 sub Markdown { 224 # 225 # Main function. The order in which other subs are called here is 226 # essential. Link and image substitutions need to happen before 227 # _EscapeSpecialChars(), so that any *'s or _'s in the <a> 228 # and <img> tags get encoded. 229 # 230 my $text = shift; 231 232 # Clear the global hashes. If we don't clear these, you get conflicts 233 # from other articles when generating a page which contains more than 234 # one article (e.g. an index page that shows the N most recent 235 # articles): 236 %g_urls = (); 237 %g_titles = (); 238 %g_html_blocks = (); 239 240 241 # Standardize line endings: 242 $text =~ s{\r\n}{\n}g; # DOS to Unix 243 $text =~ s{\r}{\n}g; # Mac to Unix 244 245 # Make sure $text ends with a couple of newlines: 246 $text .= "\n\n"; 247 248 # Convert all tabs to spaces. 249 $text = _Detab($text); 250 251 # Strip any lines consisting only of spaces and tabs. 252 # This makes subsequent regexen easier to write, because we can 253 # match consecutive blank lines with /\n+/ instead of something 254 # contorted like /[ \t]*\n+/ . 255 $text =~ s/^[ \t]+$//mg; 256 257 # Turn block-level HTML blocks into hash entries 258 $text = _HashHTMLBlocks($text); 259 260 # Strip link definitions, store in hashes. 261 $text = _StripLinkDefinitions($text); 262 263 $text = _RunBlockGamut($text); 264 265 $text = _UnescapeSpecialChars($text); 266 267 return $text . "\n"; 268 } 269 270 271 sub _StripLinkDefinitions { 272 # 273 # Strips link definitions from text, stores the URLs and titles in 274 # hash references. 275 # 276 my $text = shift; 277 my $less_than_tab = $g_tab_width - 1; 278 279 # Link defs are in the form: ^[id]: url "optional title" 280 while ($text =~ s{ 281 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1 282 [ \t]* 283 \n? # maybe *one* newline 284 [ \t]* 285 <?(\S+?)>? # url = $2 286 [ \t]* 287 \n? # maybe one newline 288 [ \t]* 289 (?: 290 (?<=\s) # lookbehind for whitespace 291 ["(] 292 (.+?) # title = $3 293 [")] 294 [ \t]* 295 )? # title is optional 296 (?:\n+|\Z) 297 } 298 {}mx) { 299 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive 300 if ($3) { 301 $g_titles{lc $1} = $3; 302 $g_titles{lc $1} =~ s/"/"/g; 303 } 304 } 305 306 return $text; 307 } 308 309 310 sub _HashHTMLBlocks { 311 my $text = shift; 312 my $less_than_tab = $g_tab_width - 1; 313 314 # Hashify HTML blocks: 315 # We only want to do this for block-level HTML tags, such as headers, 316 # lists, and tables. That's because we still want to wrap <p>s around 317 # "paragraphs" that are wrapped in non-block-level tags, such as anchors, 318 # phrase emphasis, and spans. The list of tags we're looking for is 319 # hard-coded: 320 my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/; 321 my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/; 322 323 # First, look for nested blocks, e.g.: 324 # <div> 325 # <div> 326 # tags for inner block must be indented. 327 # </div> 328 # </div> 329 # 330 # The outermost tags must start at the left margin for this to match, and 331 # the inner nested divs must be indented. 332 # We need to do this before the next, more liberal match, because the next 333 # match will start at the first `<div>` and stop at the first `</div>`. 334 $text =~ s{ 335 ( # save in $1 336 ^ # start of line (with /m) 337 <($block_tags_a) # start tag = $2 338 \b # word break 339 (.*\n)*? # any number of lines, minimally matching 340 </\2> # the matching end tag 341 [ \t]* # trailing spaces/tabs 342 (?=\n+|\Z) # followed by a newline or end of document 343 ) 344 }{ 345 my $key = md5_hex($1); 346 $g_html_blocks{$key} = $1; 347 "\n\n" . $key . "\n\n"; 348 }egmx; 349 350 351 # 352 # Now match more liberally, simply from `\n<tag>` to `</tag>\n` 353 # 354 $text =~ s{ 355 ( # save in $1 356 ^ # start of line (with /m) 357 <($block_tags_b) # start tag = $2 358 \b # word break 359 (.*\n)*? # any number of lines, minimally matching 360 .*</\2> # the matching end tag 361 [ \t]* # trailing spaces/tabs 362 (?=\n+|\Z) # followed by a newline or end of document 363 ) 364 }{ 365 my $key = md5_hex($1); 366 $g_html_blocks{$key} = $1; 367 "\n\n" . $key . "\n\n"; 368 }egmx; 369 # Special case just for <hr />. It was easier to make a special case than 370 # to make the other regex more complicated. 371 $text =~ s{ 372 (?: 373 (?<=\n\n) # Starting after a blank line 374 | # or 375 \A\n? # the beginning of the doc 376 ) 377 ( # save in $1 378 [ ]{0,$less_than_tab} 379 <(hr) # start tag = $2 380 \b # word break 381 ([^<>])*? # 382 /?> # the matching end tag 383 [ \t]* 384 (?=\n{2,}|\Z) # followed by a blank line or end of document 385 ) 386 }{ 387 my $key = md5_hex($1); 388 $g_html_blocks{$key} = $1; 389 "\n\n" . $key . "\n\n"; 390 }egx; 391 392 # Special case for standalone HTML comments: 393 $text =~ s{ 394 (?: 395 (?<=\n\n) # Starting after a blank line 396 | # or 397 \A\n? # the beginning of the doc 398 ) 399 ( # save in $1 400 [ ]{0,$less_than_tab} 401 (?s: 402 <! 403 (--.*?--\s*)+ 404 > 405 ) 406 [ \t]* 407 (?=\n{2,}|\Z) # followed by a blank line or end of document 408 ) 409 }{ 410 my $key = md5_hex($1); 411 $g_html_blocks{$key} = $1; 412 "\n\n" . $key . "\n\n"; 413 }egx; 414 415 416 return $text; 417 } 418 419 420 sub _RunBlockGamut { 421 # 422 # These are all the transformations that form block-level 423 # tags like paragraphs, headers, and list items. 424 # 425 my $text = shift; 426 427 $text = _DoHeaders($text); 428 429 # Do Horizontal Rules: 430 $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 431 $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 432 $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx; 433 434 $text = _DoLists($text); 435 436 $text = _DoCodeBlocks($text); 437 438 $text = _DoBlockQuotes($text); 439 440 # We already ran _HashHTMLBlocks() before, in Markdown(), but that 441 # was to escape raw HTML in the original Markdown source. This time, 442 # we're escaping the markup we've just created, so that we don't wrap 443 # <p> tags around block-level tags. 444 $text = _HashHTMLBlocks($text); 445 446 $text = _FormParagraphs($text); 447 448 return $text; 449 } 450 451 452 sub _RunSpanGamut { 453 # 454 # These are all the transformations that occur *within* block-level 455 # tags like paragraphs, headers, and list items. 456 # 457 my $text = shift; 458 459 $text = _DoCodeSpans($text); 460 461 $text = _EscapeSpecialChars($text); 462 463 # Process anchor and image tags. Images must come first, 464 # because ![foo][f] looks like an anchor. 465 $text = _DoImages($text); 466 $text = _DoAnchors($text); 467 468 # Make links out of things like `<http://example.com/>` 469 # Must come after _DoAnchors(), because you can use < and > 470 # delimiters in inline links like [this](<url>). 471 $text = _DoAutoLinks($text); 472 473 $text = _EncodeAmpsAndAngles($text); 474 475 $text = _DoItalicsAndBold($text); 476 477 # Do hard breaks: 478 $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g; 479 480 return $text; 481 } 482 483 484 sub _EscapeSpecialChars { 485 my $text = shift; 486 my $tokens ||= _TokenizeHTML($text); 487 488 $text = ''; # rebuild $text from the tokens 489 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags. 490 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!; 491 492 foreach my $cur_token (@$tokens) { 493 if ($cur_token->[0] eq "tag") { 494 # Within tags, encode * and _ so they don't conflict 495 # with their use in Markdown for italics and strong. 496 # We're replacing each such character with its 497 # corresponding MD5 checksum value; this is likely 498 # overkill, but it should prevent us from colliding 499 # with the escape values by accident. 500 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx; 501 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx; 502 $text .= $cur_token->[1]; 503 } else { 504 my $t = $cur_token->[1]; 505 $t = _EncodeBackslashEscapes($t); 506 $text .= $t; 507 } 508 } 509 return $text; 510 } 511 512 513 sub _DoAnchors { 514 # 515 # Turn Markdown link shortcuts into XHTML <a> tags. 516 # 517 my $text = shift; 518 519 # 520 # First, handle reference-style links: [link text] [id] 521 # 522 $text =~ s{ 523 ( # wrap whole match in $1 524 \[ 525 ($g_nested_brackets) # link text = $2 526 \] 527 528 [ ]? # one optional space 529 (?:\n[ ]*)? # one optional newline followed by spaces 530 531 \[ 532 (.*?) # id = $3 533 \] 534 ) 535 }{ 536 my $result; 537 my $whole_match = $1; 538 my $link_text = $2; 539 my $link_id = lc $3; 540 541 if ($link_id eq "") { 542 $link_id = lc $link_text; # for shortcut links like [this][]. 543 } 544 545 if (defined $g_urls{$link_id}) { 546 my $url = $g_urls{$link_id}; 547 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 548 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 549 $result = "<a href=\"$url\""; 550 if ( defined $g_titles{$link_id} ) { 551 my $title = $g_titles{$link_id}; 552 $title =~ s! \* !$g_escape_table{'*'}!gx; 553 $title =~ s! _ !$g_escape_table{'_'}!gx; 554 $result .= " title=\"$title\""; 555 } 556 $result .= ">$link_text</a>"; 557 } 558 else { 559 $result = $whole_match; 560 } 561 $result; 562 }xsge; 563 564 # 565 # Next, inline-style links: [link text](url "optional title") 566 # 567 $text =~ s{ 568 ( # wrap whole match in $1 569 \[ 570 ($g_nested_brackets) # link text = $2 571 \] 572 \( # literal paren 573 [ \t]* 574 <?(.*?)>? # href = $3 575 [ \t]* 576 ( # $4 577 (['"]) # quote char = $5 578 (.*?) # Title = $6 579 \5 # matching quote 580 )? # title is optional 581 \) 582 ) 583 }{ 584 my $result; 585 my $whole_match = $1; 586 my $link_text = $2; 587 my $url = $3; 588 my $title = $6; 589 590 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 591 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 592 $result = "<a href=\"$url\""; 593 594 if (defined $title) { 595 $title =~ s/"/"/g; 596 $title =~ s! \* !$g_escape_table{'*'}!gx; 597 $title =~ s! _ !$g_escape_table{'_'}!gx; 598 $result .= " title=\"$title\""; 599 } 600 601 $result .= ">$link_text</a>"; 602 603 $result; 604 }xsge; 605 606 return $text; 607 } 608 609 610 sub _DoImages { 611 # 612 # Turn Markdown image shortcuts into <img> tags. 613 # 614 my $text = shift; 615 616 # 617 # First, handle reference-style labeled images: ![alt text][id] 618 # 619 $text =~ s{ 620 ( # wrap whole match in $1 621 !\[ 622 (.*?) # alt text = $2 623 \] 624 625 [ ]? # one optional space 626 (?:\n[ ]*)? # one optional newline followed by spaces 627 628 \[ 629 (.*?) # id = $3 630 \] 631 632 ) 633 }{ 634 my $result; 635 my $whole_match = $1; 636 my $alt_text = $2; 637 my $link_id = lc $3; 638 639 if ($link_id eq "") { 640 $link_id = lc $alt_text; # for shortcut links like ![this][]. 641 } 642 643 $alt_text =~ s/"/"/g; 644 if (defined $g_urls{$link_id}) { 645 my $url = $g_urls{$link_id}; 646 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 647 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 648 $result = "<img src=\"$url\" alt=\"$alt_text\""; 649 if (defined $g_titles{$link_id}) { 650 my $title = $g_titles{$link_id}; 651 $title =~ s! \* !$g_escape_table{'*'}!gx; 652 $title =~ s! _ !$g_escape_table{'_'}!gx; 653 $result .= " title=\"$title\""; 654 } 655 $result .= $g_empty_element_suffix; 656 } 657 else { 658 # If there's no such link ID, leave intact: 659 $result = $whole_match; 660 } 661 662 $result; 663 }xsge; 664 665 # 666 # Next, handle inline images:  667 # Don't forget: encode * and _ 668 669 $text =~ s{ 670 ( # wrap whole match in $1 671 !\[ 672 (.*?) # alt text = $2 673 \] 674 \( # literal paren 675 [ \t]* 676 <?(\S+?)>? # src url = $3 677 [ \t]* 678 ( # $4 679 (['"]) # quote char = $5 680 (.*?) # title = $6 681 \5 # matching quote 682 [ \t]* 683 )? # title is optional 684 \) 685 ) 686 }{ 687 my $result; 688 my $whole_match = $1; 689 my $alt_text = $2; 690 my $url = $3; 691 my $title = ''; 692 if (defined($6)) { 693 $title = $6; 694 } 695 696 $alt_text =~ s/"/"/g; 697 $title =~ s/"/"/g; 698 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid 699 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold. 700 $result = "<img src=\"$url\" alt=\"$alt_text\""; 701 if (defined $title) { 702 $title =~ s! \* !$g_escape_table{'*'}!gx; 703 $title =~ s! _ !$g_escape_table{'_'}!gx; 704 $result .= " title=\"$title\""; 705 } 706 $result .= $g_empty_element_suffix; 707 708 $result; 709 }xsge; 710 711 return $text; 712 } 713 714 715 sub _DoHeaders { 716 my $text = shift; 717 718 # Setext-style headers: 719 # Header 1 720 # ======== 721 # 722 # Header 2 723 # -------- 724 # 725 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{ 726 "<h1>" . _RunSpanGamut($1) . "</h1>\n\n"; 727 }egmx; 728 729 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{ 730 "<h2>" . _RunSpanGamut($1) . "</h2>\n\n"; 731 }egmx; 732 733 734 # atx-style headers: 735 # # Header 1 736 # ## Header 2 737 # ## Header 2 with closing hashes ## 738 # ... 739 # ###### Header 6 740 # 741 $text =~ s{ 742 ^(\#{1,6}) # $1 = string of #'s 743 [ \t]* 744 (.+?) # $2 = Header text 745 [ \t]* 746 \#* # optional closing #'s (not counted) 747 \n+ 748 }{ 749 my $h_level = length($1); 750 "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n"; 751 }egmx; 752 753 return $text; 754 } 755 756 757 sub _DoLists { 758 # 759 # Form HTML ordered (numbered) and unordered (bulleted) lists. 760 # 761 my $text = shift; 762 my $less_than_tab = $g_tab_width - 1; 763 764 # Re-usable patterns to match list item bullets and number markers: 765 my $marker_ul = qr/[*+-]/; 766 my $marker_ol = qr/\d+[.]/; 767 my $marker_any = qr/(?:$marker_ul|$marker_ol)/; 768 769 # Re-usable pattern to match any entirel ul or ol list: 770 my $whole_list = qr{ 771 ( # $1 = whole list 772 ( # $2 773 [ ]{0,$less_than_tab} 774 (${marker_any}) # $3 = first list item marker 775 [ \t]+ 776 ) 777 (?s:.+?) 778 ( # $4 779 \z 780 | 781 \n{2,} 782 (?=\S) 783 (?! # Negative lookahead for another list item marker 784 [ \t]* 785 ${marker_any}[ \t]+ 786 ) 787 ) 788 ) 789 }mx; 790 791 # We use a different prefix before nested lists than top-level lists. 792 # See extended comment in _ProcessListItems(). 793 # 794 # Note: There's a bit of duplication here. My original implementation 795 # created a scalar regex pattern as the conditional result of the test on 796 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx 797 # substitution once, using the scalar as the pattern. This worked, 798 # everywhere except when running under MT on my hosting account at Pair 799 # Networks. There, this caused all rebuilds to be killed by the reaper (or 800 # perhaps they crashed, but that seems incredibly unlikely given that the 801 # same script on the same server ran fine *except* under MT. I've spent 802 # more time trying to figure out why this is happening than I'd like to 803 # admit. My only guess, backed up by the fact that this workaround works, 804 # is that Perl optimizes the substition when it can figure out that the 805 # pattern will never change, and when this optimization isn't on, we run 806 # afoul of the reaper. Thus, the slightly redundant code to that uses two 807 # static s/// patterns rather than one conditional pattern. 808 809 if ($g_list_level) { 810 $text =~ s{ 811 ^ 812 $whole_list 813 }{ 814 my $list = $1; 815 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; 816 # Turn double returns into triple returns, so that we can make a 817 # paragraph for the last item in a list, if necessary: 818 $list =~ s/\n{2,}/\n\n\n/g; 819 my $result = _ProcessListItems($list, $marker_any); 820 $result = "<$list_type>\n" . $result . "</$list_type>\n"; 821 $result; 822 }egmx; 823 } 824 else { 825 $text =~ s{ 826 (?:(?<=\n\n)|\A\n?) 827 $whole_list 828 }{ 829 my $list = $1; 830 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol"; 831 # Turn double returns into triple returns, so that we can make a 832 # paragraph for the last item in a list, if necessary: 833 $list =~ s/\n{2,}/\n\n\n/g; 834 my $result = _ProcessListItems($list, $marker_any); 835 $result = "<$list_type>\n" . $result . "</$list_type>\n"; 836 $result; 837 }egmx; 838 } 839 840 841 return $text; 842 } 843 844 845 sub _ProcessListItems { 846 # 847 # Process the contents of a single ordered or unordered list, splitting it 848 # into individual list items. 849 # 850 851 my $list_str = shift; 852 my $marker_any = shift; 853 854 855 # The $g_list_level global keeps track of when we're inside a list. 856 # Each time we enter a list, we increment it; when we leave a list, 857 # we decrement. If it's zero, we're not in a list anymore. 858 # 859 # We do this because when we're not inside a list, we want to treat 860 # something like this: 861 # 862 # I recommend upgrading to version 863 # 8. Oops, now this line is treated 864 # as a sub-list. 865 # 866 # As a single paragraph, despite the fact that the second line starts 867 # with a digit-period-space sequence. 868 # 869 # Whereas when we're inside a list (or sub-list), that line will be 870 # treated as the start of a sub-list. What a kludge, huh? This is 871 # an aspect of Markdown's syntax that's hard to parse perfectly 872 # without resorting to mind-reading. Perhaps the solution is to 873 # change the syntax rules such that sub-lists must start with a 874 # starting cardinal number; e.g. "1." or "a.". 875 876 $g_list_level++; 877 878 # trim trailing blank lines: 879 $list_str =~ s/\n{2,}\z/\n/; 880 881 882 $list_str =~ s{ 883 (\n)? # leading line = $1 884 (^[ \t]*) # leading whitespace = $2 885 ($marker_any) [ \t]+ # list marker = $3 886 ((?s:.+?) # list item text = $4 887 (\n{1,2})) 888 (?= \n* (\z | \2 ($marker_any) [ \t]+)) 889 }{ 890 my $item = $4; 891 my $leading_line = $1; 892 my $leading_space = $2; 893 894 if ($leading_line or ($item =~ m/\n{2,}/)) { 895 $item = _RunBlockGamut(_Outdent($item)); 896 } 897 else { 898 # Recursion for sub-lists: 899 $item = _DoLists(_Outdent($item)); 900 chomp $item; 901 $item = _RunSpanGamut($item); 902 } 903 904 "<li>" . $item . "</li>\n"; 905 }egmx; 906 907 $g_list_level--; 908 return $list_str; 909 } 910 911 912 913 sub _DoCodeBlocks { 914 # 915 # Process Markdown `<pre><code>` blocks. 916 # 917 918 my $text = shift; 919 920 $text =~ s{ 921 (?:\n\n|\A) 922 ( # $1 = the code block -- one or more lines, starting with a space/tab 923 (?: 924 (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces 925 .*\n+ 926 )+ 927 ) 928 ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc 929 }{ 930 my $codeblock = $1; 931 my $result; # return value 932 933 $codeblock = _EncodeCode(_Outdent($codeblock)); 934 $codeblock = _Detab($codeblock); 935 $codeblock =~ s/\A\n+//; # trim leading newlines 936 $codeblock =~ s/\s+\z//; # trim trailing whitespace 937 938 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n"; 939 940 $result; 941 }egmx; 942 943 return $text; 944 } 945 946 947 sub _DoCodeSpans { 948 # 949 # * Backtick quotes are used for <code></code> spans. 950 # 951 # * You can use multiple backticks as the delimiters if you want to 952 # include literal backticks in the code span. So, this input: 953 # 954 # Just type ``foo `bar` baz`` at the prompt. 955 # 956 # Will translate to: 957 # 958 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> 959 # 960 # There's no arbitrary limit to the number of backticks you 961 # can use as delimters. If you need three consecutive backticks 962 # in your code, use four for delimiters, etc. 963 # 964 # * You can use spaces to get literal backticks at the edges: 965 # 966 # ... type `` `bar` `` ... 967 # 968 # Turns to: 969 # 970 # ... type <code>`bar`</code> ... 971 # 972 973 my $text = shift; 974 975 $text =~ s@ 976 (`+) # $1 = Opening run of ` 977 (.+?) # $2 = The code block 978 (?<!`) 979 \1 # Matching closer 980 (?!`) 981 @ 982 my $c = "$2"; 983 $c =~ s/^[ \t]*//g; # leading whitespace 984 $c =~ s/[ \t]*$//g; # trailing whitespace 985 $c = _EncodeCode($c); 986 "<code>$c</code>"; 987 @egsx; 988 989 return $text; 990 } 991 992 993 sub _EncodeCode { 994 # 995 # Encode/escape certain characters inside Markdown code runs. 996 # The point is that in code, these characters are literals, 997 # and lose their special Markdown meanings. 998 # 999 local $_ = shift; 1000 1001 # Encode all ampersands; HTML entities are not 1002 # entities within a Markdown code span. 1003 s/&/&/g; 1004 1005 # Encode $'s, but only if we're running under Blosxom. 1006 # (Blosxom interpolates Perl variables in article bodies.) 1007 { 1008 no warnings 'once'; 1009 if (defined($blosxom::version)) { 1010 s/\$/$/g; 1011 } 1012 } 1013 1014 1015 # Do the angle bracket song and dance: 1016 s! < !<!gx; 1017 s! > !>!gx; 1018 1019 # Now, escape characters that are magic in Markdown: 1020 s! \* !$g_escape_table{'*'}!gx; 1021 s! _ !$g_escape_table{'_'}!gx; 1022 s! { !$g_escape_table{'{'}!gx; 1023 s! } !$g_escape_table{'}'}!gx; 1024 s! \[ !$g_escape_table{'['}!gx; 1025 s! \] !$g_escape_table{']'}!gx; 1026 s! \\ !$g_escape_table{'\\'}!gx; 1027 1028 return $_; 1029 } 1030 1031 1032 sub _DoItalicsAndBold { 1033 my $text = shift; 1034 1035 # <strong> must go first: 1036 $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 } 1037 {<strong>$2</strong>}gsx; 1038 1039 $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 } 1040 {<em>$2</em>}gsx; 1041 1042 return $text; 1043 } 1044 1045 1046 sub _DoBlockQuotes { 1047 my $text = shift; 1048 1049 $text =~ s{ 1050 ( # Wrap whole match in $1 1051 ( 1052 ^[ \t]*>[ \t]? # '>' at the start of a line 1053 .+\n # rest of the first line 1054 (.+\n)* # subsequent consecutive lines 1055 \n* # blanks 1056 )+ 1057 ) 1058 }{ 1059 my $bq = $1; 1060 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting 1061 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines 1062 $bq = _RunBlockGamut($bq); # recurse 1063 1064 $bq =~ s/^/ /g; 1065 # These leading spaces screw with <pre> content, so we need to fix that: 1066 $bq =~ s{ 1067 (\s*<pre>.+?</pre>) 1068 }{ 1069 my $pre = $1; 1070 $pre =~ s/^ //mg; 1071 $pre; 1072 }egsx; 1073 1074 "<blockquote>\n$bq\n</blockquote>\n\n"; 1075 }egmx; 1076 1077 1078 return $text; 1079 } 1080 1081 1082 sub _FormParagraphs { 1083 # 1084 # Params: 1085 # $text - string to process with html <p> tags 1086 # 1087 my $text = shift; 1088 1089 # Strip leading and trailing lines: 1090 $text =~ s/\A\n+//; 1091 $text =~ s/\n+\z//; 1092 1093 my @grafs = split(/\n{2,}/, $text); 1094 1095 # 1096 # Wrap <p> tags. 1097 # 1098 foreach (@grafs) { 1099 unless (defined( $g_html_blocks{$_} )) { 1100 $_ = _RunSpanGamut($_); 1101 s/^([ \t]*)/<p>/; 1102 $_ .= "</p>"; 1103 } 1104 } 1105 1106 # 1107 # Unhashify HTML blocks 1108 # 1109 foreach (@grafs) { 1110 if (defined( $g_html_blocks{$_} )) { 1111 $_ = $g_html_blocks{$_}; 1112 } 1113 } 1114 1115 return join "\n\n", @grafs; 1116 } 1117 1118 1119 sub _EncodeAmpsAndAngles { 1120 # Smart processing for ampersands and angle brackets that need to be encoded. 1121 1122 my $text = shift; 1123 1124 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: 1125 # http://bumppo.net/projects/amputator/ 1126 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&/g; 1127 1128 # Encode naked <'s 1129 $text =~ s{<(?![a-z/?\$!])}{<}gi; 1130 1131 return $text; 1132 } 1133 1134 1135 sub _EncodeBackslashEscapes { 1136 # 1137 # Parameter: String. 1138 # Returns: The string, with after processing the following backslash 1139 # escape sequences. 1140 # 1141 local $_ = shift; 1142 1143 s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first. 1144 s! \\` !$g_escape_table{'`'}!gx; 1145 s! \\\* !$g_escape_table{'*'}!gx; 1146 s! \\_ !$g_escape_table{'_'}!gx; 1147 s! \\\{ !$g_escape_table{'{'}!gx; 1148 s! \\\} !$g_escape_table{'}'}!gx; 1149 s! \\\[ !$g_escape_table{'['}!gx; 1150 s! \\\] !$g_escape_table{']'}!gx; 1151 s! \\\( !$g_escape_table{'('}!gx; 1152 s! \\\) !$g_escape_table{')'}!gx; 1153 s! \\> !$g_escape_table{'>'}!gx; 1154 s! \\\# !$g_escape_table{'#'}!gx; 1155 s! \\\+ !$g_escape_table{'+'}!gx; 1156 s! \\\- !$g_escape_table{'-'}!gx; 1157 s! \\\. !$g_escape_table{'.'}!gx; 1158 s{ \\! }{$g_escape_table{'!'}}gx; 1159 1160 return $_; 1161 } 1162 1163 1164 sub _DoAutoLinks { 1165 my $text = shift; 1166 1167 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi; 1168 1169 # Email addresses: <address@domain.foo> 1170 $text =~ s{ 1171 < 1172 (?:mailto:)? 1173 ( 1174 [-.\w]+ 1175 \@ 1176 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+ 1177 ) 1178 > 1179 }{ 1180 _EncodeEmailAddress( _UnescapeSpecialChars($1) ); 1181 }egix; 1182 1183 return $text; 1184 } 1185 1186 1187 sub _EncodeEmailAddress { 1188 # 1189 # Input: an email address, e.g. "foo@example.com" 1190 # 1191 # Output: the email address as a mailto link, with each character 1192 # of the address encoded as either a decimal or hex entity, in 1193 # the hopes of foiling most address harvesting spam bots. E.g.: 1194 # 1195 # <a href="mailto:foo@e 1196 # xample.com">foo 1197 # @example.com</a> 1198 # 1199 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk 1200 # mailing list: <http://tinyurl.com/yu7ue> 1201 # 1202 1203 my $addr = shift; 1204 1205 srand; 1206 my @encode = ( 1207 sub { '&#' . ord(shift) . ';' }, 1208 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' }, 1209 sub { shift }, 1210 ); 1211 1212 $addr = "mailto:" . $addr; 1213 1214 $addr =~ s{(.)}{ 1215 my $char = $1; 1216 if ( $char eq '@' ) { 1217 # this *must* be encoded. I insist. 1218 $char = $encode[int rand 1]->($char); 1219 } elsif ( $char ne ':' ) { 1220 # leave ':' alone (to spot mailto: later) 1221 my $r = rand; 1222 # roughly 10% raw, 45% hex, 45% dec 1223 $char = ( 1224 $r > .9 ? $encode[2]->($char) : 1225 $r < .45 ? $encode[1]->($char) : 1226 $encode[0]->($char) 1227 ); 1228 } 1229 $char; 1230 }gex; 1231 1232 $addr = qq{<a href="$addr">$addr</a>}; 1233 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part 1234 1235 return $addr; 1236 } 1237 1238 1239 sub _UnescapeSpecialChars { 1240 # 1241 # Swap back in all the special characters we've hidden. 1242 # 1243 my $text = shift; 1244 1245 while( my($char, $hash) = each(%g_escape_table) ) { 1246 $text =~ s/$hash/$char/g; 1247 } 1248 return $text; 1249 } 1250 1251 1252 sub _TokenizeHTML { 1253 # 1254 # Parameter: String containing HTML markup. 1255 # Returns: Reference to an array of the tokens comprising the input 1256 # string. Each token is either a tag (possibly with nested, 1257 # tags contained therein, such as <a href="<MTFoo>">, or a 1258 # run of text between tags. Each element of the array is a 1259 # two-element array; the first is either 'tag' or 'text'; 1260 # the second is the actual value. 1261 # 1262 # 1263 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin. 1264 # <http://www.bradchoate.com/past/mtregex.php> 1265 # 1266 1267 my $str = shift; 1268 my $pos = 0; 1269 my $len = length $str; 1270 my @tokens; 1271 1272 my $depth = 6; 1273 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth); 1274 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment 1275 (?s: <\? .*? \?> ) | # processing instruction 1276 $nested_tags/ix; # nested tags 1277 1278 while ($str =~ m/($match)/g) { 1279 my $whole_tag = $1; 1280 my $sec_start = pos $str; 1281 my $tag_start = $sec_start - length $whole_tag; 1282 if ($pos < $tag_start) { 1283 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)]; 1284 } 1285 push @tokens, ['tag', $whole_tag]; 1286 $pos = pos $str; 1287 } 1288 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len; 1289 \@tokens; 1290 } 1291 1292 1293 sub _Outdent { 1294 # 1295 # Remove one level of line-leading tabs or spaces 1296 # 1297 my $text = shift; 1298 1299 $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm; 1300 return $text; 1301 } 1302 1303 1304 sub _Detab { 1305 # 1306 # Cribbed from a post by Bart Lateur: 1307 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> 1308 # 1309 my $text = shift; 1310 1311 $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge; 1312 return $text; 1313 } 1314 1315 1316 1; 1317 1318 __END__ 1319 1320 1321 =pod 1322 1323 =head1 NAME 1324 1325 B<Markdown> 1326 1327 1328 =head1 SYNOPSIS 1329 1330 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ] 1331 [ I<file> ... ] 1332 1333 1334 =head1 DESCRIPTION 1335 1336 Markdown is a text-to-HTML filter; it translates an easy-to-read / 1337 easy-to-write structured text format into HTML. Markdown's text format 1338 is most similar to that of plain text email, and supports features such 1339 as headers, *emphasis*, code blocks, blockquotes, and links. 1340 1341 Markdown's syntax is designed not as a generic markup language, but 1342 specifically to serve as a front-end to (X)HTML. You can use span-level 1343 HTML tags anywhere in a Markdown document, and you can use block level 1344 HTML tags (like <div> and <table> as well). 1345 1346 For more information about Markdown's syntax, see: 1347 1348 http://daringfireball.net/projects/markdown/ 1349 1350 1351 =head1 OPTIONS 1352 1353 Use "--" to end switch parsing. For example, to open a file named "-z", use: 1354 1355 Markdown.pl -- -z 1356 1357 =over 4 1358 1359 1360 =item B<--html4tags> 1361 1362 Use HTML 4 style for empty element tags, e.g.: 1363 1364 <br> 1365 1366 instead of Markdown's default XHTML style tags, e.g.: 1367 1368 <br /> 1369 1370 1371 =item B<-v>, B<--version> 1372 1373 Display Markdown's version number and copyright information. 1374 1375 1376 =item B<-s>, B<--shortversion> 1377 1378 Display the short-form version number. 1379 1380 1381 =back 1382 1383 1384 1385 =head1 BUGS 1386 1387 To file bug reports or feature requests (other than topics listed in the 1388 Caveats section above) please send email to: 1389 1390 support@daringfireball.net 1391 1392 Please include with your report: (1) the example input; (2) the output 1393 you expected; (3) the output Markdown actually produced. 1394 1395 1396 =head1 VERSION HISTORY 1397 1398 See the readme file for detailed release notes for this version. 1399 1400 1.0.1 - 14 Dec 2004 1401 1402 1.0 - 28 Aug 2004 1403 1404 1405 =head1 AUTHOR 1406 1407 John Gruber 1408 http://daringfireball.net 1409 1410 PHP port and other contributions by Michel Fortin 1411 http://michelf.com 1412 1413 1414 =head1 COPYRIGHT AND LICENSE 1415 1416 Copyright (c) 2003-2004 John Gruber 1417 <http://daringfireball.net/> 1418 All rights reserved. 1419 1420 Redistribution and use in source and binary forms, with or without 1421 modification, are permitted provided that the following conditions are 1422 met: 1423 1424 * Redistributions of source code must retain the above copyright notice, 1425 this list of conditions and the following disclaimer. 1426 1427 * Redistributions in binary form must reproduce the above copyright 1428 notice, this list of conditions and the following disclaimer in the 1429 documentation and/or other materials provided with the distribution. 1430 1431 * Neither the name "Markdown" nor the names of its contributors may 1432 be used to endorse or promote products derived from this software 1433 without specific prior written permission. 1434 1435 This software is provided by the copyright holders and contributors "as 1436 is" and any express or implied warranties, including, but not limited 1437 to, the implied warranties of merchantability and fitness for a 1438 particular purpose are disclaimed. In no event shall the copyright owner 1439 or contributors be liable for any direct, indirect, incidental, special, 1440 exemplary, or consequential damages (including, but not limited to, 1441 procurement of substitute goods or services; loss of use, data, or 1442 profits; or business interruption) however caused and on any theory of 1443 liability, whether in contract, strict liability, or tort (including 1444 negligence or otherwise) arising in any way out of the use of this 1445 software, even if advised of the possibility of such damage. 1446 1447 =cut