bzrmirror%bugzilla.org 00fc957fe6 Bug 1059684: markdown text should not be rendered within a <pre> tag
r=glob,a=glob


git-svn-id: svn://10.0.0.236/trunk@265644 18797224-902f-48f8-a5cc-f745e15eee43
2014-10-28 03:15:50 +00:00

521 lines
15 KiB
Perl

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# This Source Code Form is "Incompatible With Secondary Licenses", as
# defined by the Mozilla Public License, v. 2.0.
package Bugzilla::Markdown;
use 5.10.1;
use strict;
use warnings;
use Bugzilla::Constants;
use Bugzilla::Template;
use Digest::MD5 qw(md5_hex);
use parent qw(Text::Markdown);
@Bugzilla::Markdown::EXPORT = qw(new);
# Regex to match balanced [brackets]. See Friedl's
# "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
our ($g_nested_brackets, $g_nested_parens);
$g_nested_brackets = qr{
(?> # Atomic matching
[^\[\]]+ # Anything other than brackets
|
\[
(??{ $g_nested_brackets }) # Recursive set of nested brackets
\]
)*
}x;
# Doesn't allow for whitespace, because we're using it to match URLs:
$g_nested_parens = qr{
(?> # Atomic matching
[^()\s]+ # Anything other than parens or whitespace
|
\(
(??{ $g_nested_parens }) # Recursive set of nested brackets
\)
)*
}x;
our %g_escape_table;
foreach my $char (split //, '\\`*_{}[]()>#+-.!~') {
$g_escape_table{$char} = md5_hex($char);
}
$g_escape_table{'&lt;'} = md5_hex('&lt;');
sub new {
my $invocant = shift;
my $class = ref $invocant || $invocant;
return $class->SUPER::new(tab_width => MARKDOWN_TAB_WIDTH,
# Bugzilla uses HTML not XHTML
empty_element_suffix => '>');
}
sub markdown {
my $self = shift;
my $text = shift;
my $user = Bugzilla->user;
if ($user->settings->{use_markdown}->{is_enabled}
&& $user->setting('use_markdown') eq 'on')
{
return $self->SUPER::markdown($text, @_);
}
return Bugzilla::Template::quoteUrls($text);
}
sub _Markdown {
my $self = shift;
my $text = shift;
$text = Bugzilla::Template::quoteUrls($text, undef, undef, undef, undef, 1);
return $self->SUPER::_Markdown($text, @_);
}
sub _RunSpanGamut {
# These are all the transformations that occur *within* block-level
# tags like paragraphs, headers, and list items.
my ($self, $text) = @_;
$text = $self->_DoCodeSpans($text);
$text = $self->_EscapeSpecialCharsWithinTagAttributes($text);
$text = $self->_EscapeSpecialChars($text);
$text = $self->_DoAnchors($text);
# Strikethroughs is Bugzilla's extension
$text = $self->_DoStrikethroughs($text);
$text = $self->_DoAutoLinks($text);
$text = $self->_EncodeAmpsAndAngles($text);
$text = $self->_DoItalicsAndBold($text);
$text =~ s/\n/<br$self->{empty_element_suffix}\n/g;
return $text;
}
# Override to check for HTML-escaped <>" chars.
sub _StripLinkDefinitions {
#
# Strips link definitions from text, stores the URLs and titles in
# hash references.
#
my ($self, $text) = @_;
my $less_than_tab = $self->{tab_width} - 1;
# Link defs are in the form: ^[id]: url "optional title"
while ($text =~ s{
^[ ]{0,$less_than_tab}\[(.+)\]: # id = \$1
[ \t]*
\n? # maybe *one* newline
[ \t]*
(?:&lt;)?<a\s+href="(.+?)">\2</a>(?:&gt;)? # url = \$2
[ \t]*
\n? # maybe one newline
[ \t]*
(?:
(?<=\s) # lookbehind for whitespace
(?:&quot;|\()
(.+?) # title = \$3
(?:&quot;|\))
[ \t]*
)? # title is optional
(?:\n+|\Z)
}{}omx) {
$self->{_urls}{lc $1} = $self->_EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
if ($3) {
$self->{_titles}{lc $1} = $3;
$self->{_titles}{lc $1} =~ s/"/&quot;/g;
}
}
return $text;
}
# We need to look for HTML-escaped '<' and '>' (i.e. &lt; and &gt;).
# We also remove Email linkification from the original implementation
# as it is already done in Bugzilla's quoteUrls().
sub _DoAutoLinks {
my ($self, $text) = @_;
$text =~ s{(?:<|&lt;)((?:https?|ftp):[^'">\s]+?)(?:>|&gt;)}{<a href="$1">$1</a>}gi;
return $text;
}
# The main reasons for overriding this method are
# resolving URL conflicts with Bugzilla's quoteUrls()
# and also changing '"' to '&quot;' in regular expressions wherever needed.
sub _DoAnchors {
#
# Turn Markdown link shortcuts into <a> tags.
#
my ($self, $text) = @_;
# We revert linkifications of non-email links and only
# those links whose URL and title are the same because
# this way we can be sure that link is generated by quoteUrls()
$text =~ s@<a \s+ href="(?! mailto ) (.+?)">\1</a>@$1@xmg;
#
# First, handle reference-style links: [link text] [id]
#
$text =~ s{
( # wrap whole match in $1
\[
($g_nested_brackets) # link text = $2
\]
[ ]? # one optional space
(?:\n[ ]*)? # one optional newline followed by spaces
\[
(.*?) # id = $3
\]
)
}{
my $whole_match = $1;
my $link_text = $2;
my $link_id = lc $3;
if ($link_id eq "") {
$link_id = lc $link_text; # for shortcut links like [this][].
}
$link_id =~ s{[ ]*\n}{ }g; # turn embedded newlines into spaces
$self->_GenerateAnchor($whole_match, $link_text, $link_id);
}xsge;
#
# Next, inline-style links: [link text](url "optional title")
#
$text =~ s{
( # wrap whole match in $1
\[
($g_nested_brackets) # link text = $2
\]
\( # literal paren
[ \t]*
($g_nested_parens) # href = $3
[ \t]*
( # $4
(&quot;|') # quote char = $5
(.*?) # Title = $6
\5 # matching quote
[ \t]* # ignore any spaces/tabs between closing quote and )
)? # title is optional
\)
)
}{
my $result;
my $whole_match = $1;
my $link_text = $2;
my $url = $3;
my $title = $6;
# Remove Bugzilla quoteUrls() linkification
if ($url =~ /^a href="/ && $url =~ m|</a$|) {
$url =~ s/^[^>]+>//;
$url =~ s@</a$@@;
}
# Limit URL to HTTP/HTTPS links
$url = "http://$url" unless $url =~ m!^https?://!i;
$self->_GenerateAnchor($whole_match, $link_text, undef, $url, $title);
}xsge;
#
# Last, handle reference-style shortcuts: [link text]
# These must come last in case you've also got [link test][1]
# or [link test](/foo)
#
$text =~ s{
( # wrap whole match in $1
\[
([^\[\]]+) # link text = $2; can't contain '[' or ']'
\]
)
}{
my $result;
my $whole_match = $1;
my $link_text = $2;
(my $link_id = lc $2) =~ s{[ ]*\n}{ }g; # lower-case and turn embedded newlines into spaces
$self->_GenerateAnchor($whole_match, $link_text, $link_id);
}xsge;
return $text;
}
# The purpose of overriding this function is to add support
# for a Github Flavored Markdown (GFM) feature called 'Multiple
# underscores in words'. The standard markdown specification
# specifies the underscore for making the text emphasized/bold.
# However, some variable names in programming languages contain underscores
# and we do not want a part of those variables to look emphasized/bold.
# Instead, we render them as the way they originally are.
sub _DoItalicsAndBold {
my ($self, $text) = @_;
# Handle at beginning of lines:
$text =~ s{ (^__ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S)) }
{
my $result = _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>";
$result;
}gsxe;
$text =~ s{ ^\*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
$text =~ s{ (^_ (?=\S) (.+?) (?<=\S) _ (?!\S)) }
{
my $result = _has_multiple_underscores($2) ? $1 : "<em>$2</em>";
$result;
}gsxe;
$text =~ s{ ^\* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
# <strong> must go first:
$text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (?!\S) ) }
{
my $result = _has_multiple_underscores($2) ? $1 : "<strong>$2</strong>";
$result;
}gsxe;
$text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
$text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (?!\S) ) }
{
my $result = _has_multiple_underscores($2) ? $1 : "<em>$2</em>";
$result;
}gsxe;
$text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
# And now, a second pass to catch nested strong and emphasis special cases
$text =~ s{ ( (?<=\W) __ (?=\S) (.+?[*_]*) (?<=\S) __ (\S*) ) }
{
my $result = _has_multiple_underscores($3) ? $1 : "<strong>$2</strong>$3";
$result;
}gsxe;
$text =~ s{ (?<=\W) \*\* (?=\S) (.+?[*_]*) (?<=\S) \*\* }{<strong>$1</strong>}gsx;
$text =~ s{ ( (?<=\W) _ (?=\S) (.+?) (?<=\S) _ (\S*) ) }
{
my $result = _has_multiple_underscores($3) ? $1 : "<em>$2</em>$3";
$result;
}gsxe;
$text =~ s{ (?<=\W) \* (?=\S) (.+?) (?<=\S) \* }{<em>$1</em>}gsx;
return $text;
}
sub _DoStrikethroughs {
my ($self, $text) = @_;
$text =~ s{ ^ ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }{<del>$1</del>}gsx;
$text =~ s{ (?<=_|[^~\w]) ~~ (?=\S) ([^~]+?) (?<=\S) ~~ (?!~) }{<del>$1</del>}gsx;
return $text;
}
# The original _DoCodeSpans() uses the 's' modifier in its regex
# which prevents _DoCodeBlocks() to match GFM fenced code blocks.
# We copy the code from the original implementation and remove the
# 's' modifier from it.
sub _DoCodeSpans {
my ($self, $text) = @_;
$text =~ s@
(?<!\\) # Character before opening ` can't be a backslash
(`+) # $1 = Opening run of `
(.+?) # $2 = The code block
(?<!`)
\1 # Matching closer
(?!`)
@
my $c = "$2";
$c =~ s/^[ \t]*//g; # leading whitespace
$c =~ s/[ \t]*$//g; # trailing whitespace
$c = $self->_EncodeCode($c);
"<code>$c</code>";
@egx;
return $text;
}
# Override to add GFM Fenced Code Blocks
sub _DoCodeBlocks {
my ($self, $text) = @_;
$text =~ s{
^ `{3,} [\s\t]* \n
( # $1 = the entire code block
(?: .* \n+)+?
)
`{3,} [\s\t]* $
}{
my $codeblock = $1;
my $result;
$codeblock = $self->_EncodeCode($codeblock);
$codeblock = $self->_Detab($codeblock);
$codeblock =~ s/\n\z//; # remove the trailing newline
$result = "\n\n<pre><code>" . $codeblock . "</code></pre>\n\n";
$result;
}egmx;
# And now do the standard code blocks
$text = $self->SUPER::_DoCodeBlocks($text);
return $text;
}
sub _DoBlockQuotes {
my ($self, $text) = @_;
$text =~ s{
( # Wrap whole match in $1
(?:
^[ \t]*&gt;[ \t]? # '>' at the start of a line
.+\n # rest of the first line
(?:.+\n)* # subsequent consecutive lines
\n* # blanks
)+
)
}{
my $bq = $1;
$bq =~ s/^[ \t]*&gt;[ \t]?//gm; # trim one level of quoting
$bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
$bq = $self->_RunBlockGamut($bq, {wrap_in_p_tags => 1}); # recurse
$bq =~ s/^/ /mg;
# These leading spaces screw with <pre> content, so we need to fix that:
$bq =~ s{(\s*<pre>.+?</pre>)}{
my $pre = $1;
$pre =~ s/^ //mg;
$pre;
}egs;
"<blockquote>\n$bq\n</blockquote>\n\n";
}egmx;
return $text;
}
sub _EncodeCode {
my ($self, $text) = @_;
# We need to unescape the escaped HTML characters in code blocks.
# These are the reverse of the escapings done in Bugzilla::Util::html_quote()
$text =~ s/&lt;/</g;
$text =~ s/&gt;/>/g;
$text =~ s/&quot;/"/g;
$text =~ s/&#64;/@/g;
# '&amp;' substitution must be the last one, otherwise a literal like '&gt;'
# will turn to '>' because '&' is already changed to '&amp;' in Bugzilla::Util::html_quote().
# In other words, html_quote() will change '&gt;' to '&amp;gt;' and then we will
# change '&amp;gt' -> '&gt;' -> '>' if we write this substitution as the first one.
$text =~ s/&amp;/&/g;
$text =~ s{<a \s+ href="(?:mailto:)? (.+?)"> \1 </a>}{$1}xmgi;
$text = $self->SUPER::_EncodeCode($text);
$text =~ s/~/$g_escape_table{'~'}/go;
# Encode '&lt;' to prevent URLs from getting linkified in code spans
$text =~ s/&lt;/$g_escape_table{'&lt;'}/go;
return $text;
}
sub _EncodeBackslashEscapes {
my ($self, $text) = @_;
$text = $self->SUPER::_EncodeBackslashEscapes($text);
$text =~ s/\\~/$g_escape_table{'~'}/go;
return $text;
}
sub _UnescapeSpecialChars {
my ($self, $text) = @_;
$text = $self->SUPER::_UnescapeSpecialChars($text);
$text =~ s/$g_escape_table{'~'}/~/go;
$text =~ s/$g_escape_table{'&lt;'}/&lt;/go;
return $text;
}
# Check if the passed string is of the form multiple_underscores_in_a_word.
# To check that, we first need to make sure that the string does not contain
# any white-space. Then, if the string is composed of non-space chunks which
# are bound together with underscores, the string has the desired form.
sub _has_multiple_underscores {
my $string = shift;
return 0 unless defined($string) && length($string);
return 0 if $string =~ /[\t\s]+/;
return 1 if scalar (split /_/, $string) > 1;
return 0;
}
1;
__END__
=head1 NAME
Bugzilla::Markdown - Generates HTML output from structured plain-text input.
=head1 SYNOPSIS
use Bugzilla::Markdown;
my $markdown = Bugzilla::Markdown->new();
print $markdown->markdown($text);
=head1 DESCRIPTION
Bugzilla::Markdown implements a Markdown engine that produces
an HTML-based output from a given plain-text input.
The majority of the implementation is done by C<Text::Markdown>
CPAN module. It also applies the linkifications done in L<Bugzilla::Template>
to the input resulting in an output which is a combination of both Markdown
structures and those defined by Bugzilla itself.
=head2 Accessors
=over
=item C<markdown>
C<string> Produces an HTML-based output string based on the structures
and format defined in the given plain-text input.
=over
=item B<Params>
=over
=item C<text>
C<string> A plain-text string which includes Markdown structures.
=back
=back
=back