r=glob,a=glob git-svn-id: svn://10.0.0.236/trunk@265886 18797224-902f-48f8-a5cc-f745e15eee43
521 lines
15 KiB
Perl
521 lines
15 KiB
Perl
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
#
|
|
# This Source Code Form is "Incompatible With Secondary Licenses", as
|
|
# defined by the Mozilla Public License, v. 2.0.
|
|
|
|
package Bugzilla::Markdown;
|
|
|
|
use 5.10.1;
|
|
use strict;
|
|
use warnings;
|
|
|
|
use Bugzilla::Constants;
|
|
use Bugzilla::Template;
|
|
|
|
use Digest::MD5 qw(md5_hex);
|
|
|
|
use parent qw(Text::MultiMarkdown);
|
|
|
|
@Bugzilla::Markdown::EXPORT = qw(new);

# Patterns matching text with balanced, arbitrarily nested delimiters.
# Each pattern refers back to itself through a postponed (??{ ... })
# sub-expression. See Friedl's "Mastering Regular Expressions", 2nd Ed.,
# pp. 328-331.
our ($g_nested_brackets, $g_nested_parens);

# Balanced [square brackets].
$g_nested_brackets = qr{
    (?>                                 # Atomic matching
        [^\[\]]+                        # Anything other than brackets
      |
        \[
          (??{ $g_nested_brackets })    # Recursive set of nested brackets
        \]
    )*
}x;

# Balanced (parentheses). Whitespace is not allowed here because this
# pattern is used to match URLs.
$g_nested_parens = qr{
    (?>                                 # Atomic matching
        [^()\s]+                        # Anything other than parens or whitespace
      |
        \(
          (??{ $g_nested_parens })      # Recursive set of nested parens
        \)
    )*
}x;

# Maps each Markdown-significant character (plus '<') to the MD5 hex
# digest of that character. The digests serve as placeholders while the
# text is being transformed.
our %g_escape_table;
$g_escape_table{$_} = md5_hex($_)
    for split(//, '\\`*_{}[]()>#+-.!~'), '<';
|
|
|
|
# Constructor. Accepts a class name or an existing object as invocant
# and delegates to the parent constructor with Bugzilla's fixed
# settings. Any further arguments supplied by the caller are ignored.
sub new {
    my ($invocant) = @_;
    my $class = ref($invocant) || $invocant;

    my @settings = (
        tab_width => MARKDOWN_TAB_WIDTH,
        # Bugzilla uses HTML not XHTML
        empty_element_suffix => '>',
    );

    return $class->SUPER::new(@settings);
}
|
|
|
|
# Public entry point. Renders $text as Markdown when the current user's
# 'use_markdown' setting is enabled and set to 'on'; otherwise the text
# is only passed through Bugzilla::Template::quoteUrls().
# Arguments after $text are forwarded to the parent's markdown().
sub markdown {
    my $self = shift;
    my $text = shift;
    my $user = Bugzilla->user;

    my $wants_markdown = $user->settings->{use_markdown}->{is_enabled}
        && $user->setting('use_markdown') eq 'on';

    return Bugzilla::Template::quoteUrls($text) unless $wants_markdown;

    return $self->SUPER::markdown($text, @_);
}
|
|
|
|
# Runs Bugzilla's quoteUrls() over the text (with its sixth argument set
# to 1) before handing the result to the parent's _Markdown().
# Arguments after $text are forwarded unchanged.
sub _Markdown {
    my $self = shift;
    my $text = shift;

    my $linkified
        = Bugzilla::Template::quoteUrls($text, undef, undef, undef, undef, 1);

    return $self->SUPER::_Markdown($linkified, @_);
}
|
|
|
|
# Applies every transformation that occurs *within* block-level tags
# such as paragraphs, headers and list items, then turns each newline
# into a <br> followed by a newline. Returns the transformed text.
sub _RunSpanGamut {
    my ($self, $text) = @_;

    # The order matters: each step works on the output of the previous
    # one. _DoStrikethroughs is Bugzilla's extension.
    my @span_steps = qw(
        _DoCodeSpans
        _EscapeSpecialCharsWithinTagAttributes
        _EscapeSpecialChars
        _DoAnchors
        _DoStrikethroughs
        _DoAutoLinks
        _EncodeAmpsAndAngles
        _DoItalicsAndBold
    );
    for my $step (@span_steps) {
        $text = $self->$step($text);
    }

    my $line_break = "<br$self->{empty_element_suffix}\n";
    $text =~ s/\n/$line_break/g;

    return $text;
}
|
|
|
|
# Override to check for HTML-escaped <>" chars (&lt; &gt; &quot;).
#
# Strips link definitions from the text and stores their URLs and titles
# in $self->{_urls} and $self->{_titles}, keyed by lower-cased link id.
# The URL is expected in the linkified form <a href="URL">URL</a>,
# optionally wrapped in &lt; ... &gt;.
#
# Params:  $text - the text to scan.
# Returns: the text with all recognised link definitions removed.
sub _StripLinkDefinitions {
    my ($self, $text) = @_;
    my $less_than_tab = $self->{tab_width} - 1;

    # Link defs are in the form: ^[id]: url "optional title"
    while ($text =~ s{
            ^[ ]{0,$less_than_tab}\[(.+)\]:     # id = capture 1
              [ \t]*
              \n?                               # maybe *one* newline
              [ \t]*
            (?:&lt;)?<a\s+href="(.+?)">\2</a>(?:&gt;)?   # url = capture 2
              [ \t]*
              \n?                               # maybe one newline
              [ \t]*
            (?:
                (?<=\s)                         # lookbehind for whitespace
                (?:&quot;|\()
                (.+?)                           # title = capture 3
                (?:&quot;|\))
                [ \t]*
            )?                                  # title is optional
            (?:\n+|\Z)
        }{}omx)
    {
        # Copy the captures right away so later matches cannot disturb them.
        my ($id, $url, $title) = (lc($1), $2, $3);    # Link IDs are case-insensitive

        $self->{_urls}{$id} = $self->_EncodeAmpsAndAngles($url);

        # Test definedness and length so that a title of "0" is kept.
        if (defined $title && length $title) {
            $title =~ s/"/&quot;/g;
            $self->{_titles}{$id} = $title;
        }
    }

    return $text;
}
|
|
|
|
# Turns <URL> autolinks into anchors. We need to look for HTML-escaped
# '<' and '>' (i.e. &lt; and &gt;) as well as the raw characters.
# We also remove Email linkification from the original implementation
# as it is already done in Bugzilla's quoteUrls().
#
# Params:  $text - the text to transform.
# Returns: the text with http, https and ftp autolinks converted.
sub _DoAutoLinks {
    my ($self, $text) = @_;

    $text =~ s{(?:<|&lt;)((?:https?|ftp):[^'">\s]+?)(?:>|&gt;)}{<a href="$1">$1</a>}gi;

    return $text;
}
|
|
|
|
# The main reasons for overriding this method are
# resolving URL conflicts with Bugzilla's quoteUrls()
# and also changing '"' to '&quot;' in regular expressions wherever needed.
#
# Turns Markdown link shortcuts into <a> tags. Three forms are handled,
# in this order: [text][id], [text](url "title") and [text].
# The anchors themselves are built by $self->_GenerateAnchor(), which is
# not defined in this file (presumably inherited from the parent class).
#
# Params:  $text - the text to transform.
# Returns: the transformed text.
sub _DoAnchors {
    my ($self, $text) = @_;

    # We revert linkifications of non-email links and only
    # those links whose URL and title are the same because
    # this way we can be sure that link is generated by quoteUrls()
    $text =~ s@<a \s+ href="(?! mailto ) (.+?)">\1</a>@$1@xmg;

    #
    # First, handle reference-style links: [link text] [id]
    #
    $text =~ s{
        (                               # wrap whole match in capture 1
          \[
            ($g_nested_brackets)        # link text = capture 2
          \]

          [ ]?                          # one optional space
          (?:\n[ ]*)?                   # one optional newline followed by spaces

          \[
            (.*?)                       # id = capture 3
          \]
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        my $link_id     = lc $3;

        if ($link_id eq "") {
            $link_id = lc $link_text;   # for shortcut links like [this][].
        }

        $link_id =~ s{[ ]*\n}{ }g;      # turn embedded newlines into spaces

        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
    }xsge;

    #
    # Next, inline-style links: [link text](url "optional title")
    #
    $text =~ s{
        (                               # wrap whole match in capture 1
          \[
            ($g_nested_brackets)        # link text = capture 2
          \]
          \(                            # literal paren
            [ \t]*
            ($g_nested_parens)          # href = capture 3
            [ \t]*
            (                           # capture 4
              (&quot;|')                # quote char = capture 5; text is HTML-escaped here
              (.*?)                     # title = capture 6
              \5                        # matching quote
              [ \t]*                    # ignore any spaces/tabs between closing quote and )
            )?                          # title is optional
          \)
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        my $url         = $3;
        my $title       = $6;

        # Remove Bugzilla quoteUrls() linkification
        if ($url =~ /^a href="/ && $url =~ m|</a$|) {
            $url =~ s/^[^>]+>//;
            $url =~ s@</a$@@;
        }

        # URLs that do not look safe get an explicit http:// scheme.
        my $safe_url_regexp = Bugzilla::Template::SAFE_URL_REGEXP();
        $url = "http://$url" unless $url =~ /^$safe_url_regexp$/;

        $self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
    }xsge;

    #
    # Last, handle reference-style shortcuts: [link text]
    # These must come last in case you've also got [link test][1]
    # or [link test](/foo)
    #
    $text =~ s{
        (                               # wrap whole match in capture 1
          \[
            ([^\[\]]+)                  # link text = capture 2; no brackets inside
          \]
        )
    }{
        my $whole_match = $1;
        my $link_text   = $2;
        (my $link_id = lc $2) =~ s{[ ]*\n}{ }g;   # lower-case and turn embedded newlines into spaces

        $self->_GenerateAnchor($whole_match, $link_text, $link_id);
    }xsge;

    return $text;
}
|
|
|
|
# The purpose of overriding this function is to add support
# for a Github Flavored Markdown (GFM) feature called 'Multiple
# underscores in words'. Standard Markdown uses the underscore to mark
# emphasized/bold text, but variable names in programming languages
# often contain underscores and must not be partly emphasized.
# Such words are rendered exactly as they were written.
#
# Params:  $text - the text to transform.
# Returns: the text with <strong> and <em> markup applied.
sub _DoItalicsAndBold {
    my ($self, $text) = @_;

    # Decides the replacement for an underscore-delimited match: the
    # matched text verbatim when the probed string looks like
    # multiple_underscores_in_a_word, otherwise the tagged inner text
    # followed by the given tail.
    my $emphasize = sub {
        my ($tag, $probe, $verbatim, $inner, $tail) = @_;
        return $verbatim if _has_multiple_underscores($probe);
        return "<$tag>$inner</$tag>$tail";
    };

    # Handle at beginning of lines:
    $text =~ s{ (^__ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S)) }
              { $emphasize->('strong', $2, $1, $2, '') }gsxe;

    $text =~ s{ ^\*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ (^_ (?=\S) (.+?) (?<=\S) _ (?!\S)) }
              { $emphasize->('em', $2, $1, $2, '') }gsxe;

    $text =~ s{ ^\* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    # <strong> must go first:
    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S) ) }
              { $emphasize->('strong', $2, $1, $2, '') }gsxe;

    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (?!\S) ) }
              { $emphasize->('em', $2, $1, $2, '') }gsxe;

    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    # And now, a second pass to catch nested strong and emphasis special cases.
    # Here the non-space run following the closing delimiter is what gets
    # probed for multiple underscores.
    $text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (\S*) ) }
              { $emphasize->('strong', $3, $1, $2, $3) }gsxe;

    $text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;

    $text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (\S*) ) }
              { $emphasize->('em', $3, $1, $2, $3) }gsxe;

    $text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;

    return $text;
}
|
|
|
|
# Bugzilla extension: wraps ~~text~~ in <del> tags.
#
# Params:  $text - the text to transform.
# Returns: the text with strikethrough markup applied.
sub _DoStrikethroughs {
    my ($self, $text) = @_;

    my @strike_patterns = (
        # At the very start of the text.
        qr{ ^ ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx,
        # Elsewhere: preceded by '_' or by a character that is neither
        # '~' nor a word character.
        qr{ (?<=_|[^~\w]) ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }sx,
    );

    for my $pattern (@strike_patterns) {
        $text =~ s{$pattern}{<del>$1</del>}g;
    }

    return $text;
}
|
|
|
|
# The original _DoCodeSpans() uses the 's' modifier in its regex
# which prevents _DoCodeBlocks() from matching GFM fenced code blocks.
# This is a copy of the original implementation without that modifier,
# so a code span cannot extend across a newline.
#
# Params:  $text - the text to transform.
# Returns: the text with `code` spans turned into <code> elements.
sub _DoCodeSpans {
    my ($self, $text) = @_;

    # Trims spaces and tabs from both ends of the span, encodes it with
    # _EncodeCode() and wraps it in <code> tags.
    my $render_span = sub {
        my ($code) = @_;
        $code =~ s/^[ \t]*//g;    # leading whitespace
        $code =~ s/[ \t]*$//g;    # trailing whitespace
        $code = $self->_EncodeCode($code);
        return "<code>$code</code>";
    };

    $text =~ s{
        (?<!\\)     # Character before opening ` can't be a backslash
        (`+)        # capture 1 = Opening run of `
        (.+?)       # capture 2 = The code block
        (?<!`)
        \1          # Matching closer
        (?!`)
    }{ $render_span->("$2") }egx;

    return $text;
}
|
|
|
|
# Override to add GFM Fenced Code Blocks.
#
# A fenced block opens with a line of three or more backticks and ends
# at the next such line. Its content is encoded with _EncodeCode(),
# detabbed and wrapped in <pre><code>. The parent implementation then
# handles the standard code blocks.
#
# Params:  $text - the text to transform.
# Returns: the transformed text.
sub _DoCodeBlocks {
    my ($self, $text) = @_;

    $text =~ s{
        ^ `{3,} [\s\t]* \n
        (                   # capture 1 = the entire code block
          (?: .* \n+)+?
        )
        `{3,} [\s\t]* $
    }{
        my $fenced = $1;

        $fenced = $self->_EncodeCode($fenced);
        $fenced = $self->_Detab($fenced);
        $fenced =~ s/\n\z//;    # remove the trailing newline

        "\n\n<pre><code>" . $fenced . "</code></pre>\n\n";
    }egmx;

    # And now do the standard code blocks
    $text = $self->SUPER::_DoCodeBlocks($text);

    return $text;
}
|
|
|
|
# Turns quoted blocks into <blockquote class="markdown"> elements.
#
# _Markdown() passes the text through quoteUrls() before any block
# processing, and _EncodeCode() documents that '>' arrives HTML-escaped,
# so the quote marker is matched as '&gt;' as well as a raw '>'.
# NOTE(review): confirm that quoteUrls() always escapes '>' on this path.
#
# Params:  $text - the text to transform.
# Returns: the transformed text.
sub _DoBlockQuotes {
    my ($self, $text) = @_;

    $text =~ s{
        (                                   # Wrap whole match in capture 1
          (?:
            ^[ \t]*(?:&gt;|>)[ \t]?         # quote marker at the start of a line
              .+\n                          # rest of the first line
            (?:.+\n)*                       # subsequent consecutive lines
            \n*                             # blanks
          )+
        )
    }{
        my $bq = $1;
        $bq =~ s/^[ \t]*(?:&gt;|>)[ \t]?//gm;   # trim one level of quoting
        $bq =~ s/^[ \t]+$//mg;                  # trim whitespace-only lines
        $bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1});    # recurse
        $bq =~ s/^/ /mg;
        # These leading spaces screw with <pre> content, so we need to fix that:
        $bq =~ s{(\s*<pre>.+?</pre>)}{
            my $pre = $1;
            $pre =~ s/^ //mg;
            $pre;
        }egs;
        "<blockquote class=\"markdown\">\n$bq\n</blockquote>\n\n";
    }egmx;

    return $text;
}
|
|
|
|
# Prepares the content of a code span or code block.
#
# The HTML escaping applied earlier is undone, anchors whose text equals
# their href (optionally prefixed with mailto:) are reduced to plain
# text, the parent's _EncodeCode() is applied, and finally '~' and '<'
# are replaced with their placeholders from %g_escape_table.
#
# Params:  $text - the raw code text.
# Returns: the encoded code text.
sub _EncodeCode {
    my ($self, $text) = @_;

    # We need to unescape the escaped HTML characters in code blocks.
    # These are the reverse of the escapings done in Bugzilla::Util::html_quote()
    $text =~ s/&lt;/</g;
    $text =~ s/&gt;/>/g;
    $text =~ s/&quot;/"/g;
    $text =~ s/&#64;/\@/g;
    # '&amp;' substitution must be the last one, otherwise a literal like '&gt;'
    # will turn to '>' because '&' is already changed to '&amp;' in
    # Bugzilla::Util::html_quote(). In other words, html_quote() will change
    # '&gt;' to '&amp;gt;' and then we would change '&amp;gt;' -> '&gt;' -> '>'
    # if this substitution were the first one.
    $text =~ s/&amp;/&/g;

    # Drop anchors whose link text equals their href, optionally after mailto:.
    $text =~ s{<a \s+ href="(?:mailto:)? (.+?)"> \1 </a>}{$1}xmgi;

    $text = $self->SUPER::_EncodeCode($text);

    # Hide '~' so that it is not treated as strikethrough markup.
    $text =~ s/~/$g_escape_table{'~'}/go;
    # Encode '<' to prevent URLs from getting linkified in code spans
    $text =~ s/</$g_escape_table{'<'}/go;

    return $text;
}
|
|
|
|
# Extends the parent's backslash-escape handling with '\~', which is
# replaced by the '~' placeholder from %g_escape_table.
#
# Params:  $text - the text to transform.
# Returns: the transformed text.
sub _EncodeBackslashEscapes {
    my ($self, $text) = @_;

    my $encoded = $self->SUPER::_EncodeBackslashEscapes($text);
    $encoded =~ s/\\~/$g_escape_table{'~'}/go;

    return $encoded;
}
|
|
|
|
# Extends the parent's unescaping by turning the placeholders for '~'
# and '<' from %g_escape_table back into the literal characters.
#
# Params:  $text - the text to transform.
# Returns: the transformed text.
sub _UnescapeSpecialChars {
    my ($self, $text) = @_;

    my $restored = $self->SUPER::_UnescapeSpecialChars($text);

    for my $char ('~', '<') {
        $restored =~ s/$g_escape_table{$char}/$char/g;
    }

    return $restored;
}
|
|
|
|
# Check if the passed string is of the form multiple_underscores_in_a_word.
# The string must not contain any whitespace, and splitting it on
# underscores must yield more than one chunk.
#
# Params:  $string - the candidate string (may be undef or empty).
# Returns: 1 if the string has the desired form, 0 otherwise.
sub _has_multiple_underscores {
    my ($string) = @_;

    return 0 unless defined($string) && length($string);
    return 0 if $string =~ /\s/;

    # Split into an array first: on perl 5.10 (the minimum this module
    # declares) split in scalar context splits into @_ and is deprecated.
    my @chunks = split /_/, $string;

    return @chunks > 1 ? 1 : 0;
}
|
|
|
|
1;
|
|
|
|
__END__
|
|
|
|
=head1 NAME
|
|
|
|
Bugzilla::Markdown - Generates HTML output from structured plain-text input.
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
use Bugzilla::Markdown;
|
|
|
|
my $markdown = Bugzilla::Markdown->new();
|
|
print $markdown->markdown($text);
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
Bugzilla::Markdown implements a Markdown engine that produces
|
|
an HTML-based output from a given plain-text input.
|
|
|
|
The majority of the implementation is done by C<Text::MultiMarkdown>
|
|
CPAN module. It also applies the linkifications done in L<Bugzilla::Template>
|
|
to the input resulting in an output which is a combination of both Markdown
|
|
structures and those defined by Bugzilla itself.
|
|
|
|
=head2 Accessors
|
|
|
|
=over
|
|
|
|
=item C<markdown>
|
|
|
|
C<string> Produces an HTML-based output string based on the structures
|
|
and format defined in the given plain-text input.
|
|
|
|
=over
|
|
|
|
=item B<Params>
|
|
|
|
=over
|
|
|
|
=item C<text>
|
|
|
|
C<string> A plain-text string which includes Markdown structures.
|
|
|
|
=back
|
|
|
|
=back
|
|
|
|
=back
|