diff options
| -rw-r--r-- | js/lib/inlines.js | 12 | ||||
| -rw-r--r-- | spec.txt | 242 | ||||
| -rw-r--r-- | src/inlines.c | 14 | 
3 files changed, 244 insertions, 24 deletions
diff --git a/js/lib/inlines.js b/js/lib/inlines.js index c799d0d..297d31f 100644 --- a/js/lib/inlines.js +++ b/js/lib/inlines.js @@ -41,6 +41,8 @@ var HTMLTAG = "(?:" + OPENTAG + "|" + CLOSETAG + "|" + HTMLCOMMENT + "|" +          PROCESSINGINSTRUCTION + "|" + DECLARATION + "|" + CDATA + ")";  var ENTITY = "&(?:#x[a-f0-9]{1,8}|#[0-9]{1,8}|[a-z][a-z0-9]{1,31});"; +var rePunctuation = new RegExp(/^[\u2000-\u206F\u2E00-\u2E7F\\'!"#\$%&\(\)\*\+,\-\.\/:;<=>\?@\[\]\^_`\{\|\}~]/); +  var reHtmlTag = new RegExp('^' + HTMLTAG, 'i');  var reLinkTitle = new RegExp( @@ -227,8 +229,14 @@ var scanDelims = function(cc) {          char_after = fromCodePoint(cc_after);      } -    var can_open = numdelims > 0 && !(/\s/.test(char_after)); -    var can_close = numdelims > 0 && !(/\s/.test(char_before)); +    var can_open = numdelims > 0 && !(/\s/.test(char_after)) && +            !(rePunctuation.test(char_after) && +             !(/\s/.test(char_before)) && +             !(rePunctuation.test(char_before))); +    var can_close = numdelims > 0 && !(/\s/.test(char_before)) && +            !(rePunctuation.test(char_before) && +              !(/\s/.test(char_after)) && +              !(rePunctuation.test(char_after)));      if (cc === C_UNDERSCORE) {          can_open = can_open && !((/[a-z0-9]/i).test(char_before));          can_close = can_close && !((/[a-z0-9]/i).test(char_after)); @@ -4390,36 +4390,107 @@ internal emphasis: foo*bar*baz  no emphasis: foo_bar_baz  ``` -The following rules capture all of these patterns, while allowing -for efficient parsing strategies that do not backtrack: +The rules given below capture all of these patterns, while allowing +for efficient parsing strategies that do not backtrack. + +First, some definitions.  A [delimiter run](@delimiter-run) is either +a sequence of one or more `*` characters that is not preceded or +followed by a `*` character, or a sequence of one or more `_` +characters that is not preceded or followed by a `_` character. 
+ +A [left-flanking delimiter run](@right-facing-delimiter-run) is +a [delimiter run](#delimiter-run) that is (a) not followed by [unicode +whitespace](#unicode-whitespace), and (b) either not followed by a +[punctuation character](#punctuation-character), or +preceded by [unicode whitespace](#unicode-whitespace) or +a [punctuation character](#punctuation-character). + +A [right-flanking delimiter run](@left-facing-delimiter-run) is +a [delimiter run](#delimiter-run) that is (a) not preceded by [unicode +whitespace](#unicode-whitespace), and (b) either not preceded by a +[punctuation character](#punctuation-character), or +followed by [unicode whitespace](#unicode-whitespace) or +a [punctuation character](#punctuation-character). + +Here are some examples of delimiter runs. + +  - left-flanking but not right-flanking: + +    ``` +    ***abc +      _abc +    **"abc" +     _"abc" +    ``` + +  - right-flanking but not left-flanking: + +    ``` +    abc*** +      abc_ +    "abc"** +     "abc"_ +    ``` + +  - Both left and right-flanking: + +    ``` +    abc***def +    "abc"_"def" +    ``` + +  - Neither left nor right-flanking: + +    ``` +    abc *** def +    a _ b +    ``` + +(The idea of distinguishing left-flanking and right-flanking +delimiter runs based on the character before and the character +after comes from Roopesh Chander's +[vfmd](http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags). +vfmd uses the terminology "emphasis indicator string" instead of "delimiter +run," and its rules for distinguishing left- and right-flanking runs +are a bit more complex than the ones given here.) + +The following rules define emphasis and strong emphasis:  1.  A single `*` character [can open emphasis](@can-open-emphasis) -    iff it is not followed by [unicode whitespace](#unicode-whitespace). +    iff it is part of a +    [left-flanking delimiter run](#right-facing-delimiter-run).  2.  
A single `_` character [can open emphasis](#can-open-emphasis) iff -    it is not followed by [unicode whitespace](#unicode-whitespace) -    and it is not preceded by an ASCII alphanumeric character. +    it is part of a +    [left-flanking delimiter run](#right-facing-delimiter-run) +    and is not preceded by an ASCII alphanumeric character.  3.  A single `*` character [can close emphasis](@can-close-emphasis) -    iff it is not preceded by [unicode whitespace](#unicode-whitespace). +    iff it is part of a +    [right-flanking delimiter run](#left-facing-delimiter-run). -4.  A single `_` character [can close emphasis](#can-close-emphasis) iff -    it is not preceded by [unicode whitespace](#unicode-whitespace) +4.  A single `_` character [can close emphasis](#can-close-emphasis) +    iff it is part of a +    [right-flanking delimiter run](#left-facing-delimiter-run)      and it is not followed by an ASCII alphanumeric character.  5.  A double `**` [can open strong emphasis](@can-open-strong-emphasis) -    iff it is not followed by [unicode whitespace](#unicode-whitespace). +    iff it is part of a +    [left-flanking delimiter run](#right-facing-delimiter-run).  6.  A double `__` [can open strong emphasis](#can-open-strong-emphasis) -    iff it is not followed by [unicode whitespace](#unicode-whitespace) -    and it is not preceded by an ASCII alphanumeric character. +    iff it is part of a +    [left-flanking delimiter run](#right-facing-delimiter-run) +    and is not preceded by an ASCII alphanumeric character.  7.  A double `**` [can close strong emphasis](@can-close-strong-emphasis) -    iff it is not preceded by [unicode whitespace](#unicode-whitespace). +    iff it is part of a +    [right-flanking delimiter run](#left-facing-delimiter-run).  8.  A double `__` [can close strong emphasis](#can-close-strong-emphasis) -    iff it is not preceded by [unicode whitespace](#unicode-whitespace) -    and it is not followed by an ASCII alphanumeric character. 
+    iff it is part of a +    [right-flanking delimiter run](#left-facing-delimiter-run) +    and is not followed by an ASCII alphanumeric character.  9.  Emphasis begins with a delimiter that [can open      emphasis](#can-open-emphasis) and ends with a delimiter that [can close @@ -4487,7 +4558,8 @@ Rule 1:  .  This is not emphasis, because the opening `*` is followed by -whitespace: +whitespace, and hence not part of a [left-flanking delimiter +run](#right-facing-delimiter-run):  .  a * foo bar* @@ -4495,6 +4567,16 @@ a * foo bar*  <p>a * foo bar*</p>  . +This is not emphasis, because the opening `*` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run](#right-facing-delimiter-run): + +. +a*"foo"* +. +<p>a*"foo"*</p> +. +  Unicode nonbreaking spaces count as whitespace, too:  . @@ -4534,6 +4616,15 @@ _ foo bar_  <p>_ foo bar_</p>  . +This is not emphasis, because the opening `_` is preceded +by an alphanumeric and followed by punctuation: + +. +a_"foo"_ +. +<p>a_"foo"_</p> +. +  Emphasis with `_` is not allowed inside ASCII words:  . @@ -4558,6 +4649,15 @@ But it is permitted inside non-ASCII words:  Rule 3: +This is not emphasis, because the closing delimiter does +not match the opening delimiter: + +. +_foo* +. +<p>_foo*</p> +. +  This is not emphasis, because the closing `*` is preceded by  whitespace: @@ -4567,6 +4667,26 @@ whitespace:  <p>*foo bar *</p>  . +This is not emphasis, because the second `*` is +preceded by punctuation and followed by an alphanumeric +(hence it is not part of a [right-flanking delimiter +run](#left-facing-delimiter-run)): + +. +*(*foo) +. +<p>*(*foo)</p> +. + +The point of this restriction is more easily appreciated +with this example: + +. +*(*foo*)* +. +<p><em>(<em>foo</em>)</em></p> +. +  Intraword emphasis with `*` is allowed:  . @@ -4587,7 +4707,24 @@ _foo bar _  <p>_foo bar _</p>  . 
-Intraword emphasis: +This is not emphasis, because the second `_` is +preceded by punctuation and followed by an alphanumeric: + +. +_(_foo) +. +<p>_(_foo)</p> +. + +This is emphasis within emphasis: + +. +_(_foo_)_ +. +<p><em>(<em>foo</em>)</em></p> +. + +Intraword emphasis is disallowed for `_`:  .  _foo_bar @@ -4624,6 +4761,16 @@ followed by whitespace:  <p>** foo bar**</p>  . +This is not strong emphasis, because the opening `**` is preceded +by an alphanumeric and followed by punctuation, and hence +not part of a [left-flanking delimiter run](#right-facing-delimiter-run): + +. +a**"foo"** +. +<p>a**"foo"**</p> +. +  Intraword strong emphasis with `**` is permitted:  . @@ -4649,7 +4796,16 @@ __ foo bar__  <p>__ foo bar__</p>  . -Intraword emphasis examples: +This is not strong emphasis, because the opening `__` is preceded +by an alphanumeric and followed by punctuation: + +. +a__"foo"__ +. +<p>a__"foo"__</p> +. + +Intraword strong emphasis is forbidden with `__`:  .  foo__bar__ @@ -4689,6 +4845,38 @@ by whitespace:  (Nor can it be interpreted as an emphasized `*foo bar *`, because of  Rule 11.) +This is not strong emphasis, because the second `**` is +preceded by punctuation and followed by an alphanumeric: + +. +**(**foo) +. +<p>**(**foo)</p> +. + +The point of this restriction is more easily appreciated +with these examples: + +. +*(**foo**)* +. +<p><em>(<strong>foo</strong>)</em></p> +. + +. +**Gomphocarpus (*Gomphocarpus physocarpus*, syn. +*Asclepias physocarpa*)** +. +<p><strong>Gomphocarpus (<em>Gomphocarpus physocarpus</em>, syn. +<em>Asclepias physocarpa</em>)</strong></p> +. + +. +**foo "*bar*" foo** +. +<p><strong>foo "<em>bar</em>" foo</strong></p> +. +  Intraword emphasis:  . @@ -4708,7 +4896,25 @@ __foo bar __  <p>__foo bar __</p>  . -Intraword strong emphasis examples: +This is not strong emphasis, because the second `__` is +preceded by punctuation and followed by an alphanumeric: + +. +__(__foo) +. +<p>__(__foo)</p> +. 
+ +The point of this restriction is more easily appreciated +with this example: + +. +_(__foo__)_ +. +<p><em>(<strong>foo</strong>)</em></p> +. + +Intraword strong emphasis is forbidden with `__`:  .  __foo__bar diff --git a/src/inlines.c b/src/inlines.c index f63fabe..3f69837 100644 --- a/src/inlines.c +++ b/src/inlines.c @@ -261,7 +261,7 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)  		}  		len = utf8proc_iterate(subj->input.data + before_char_pos,  				 subj->pos - before_char_pos, &before_char); -		if (len == 0) { +		if (len == -1) {  			before_char = 10;  		}  	} @@ -273,11 +273,17 @@ scan_delims(subject* subj, unsigned char c, bool * can_open, bool * can_close)  	len = utf8proc_iterate(subj->input.data + subj->pos,  			 subj->input.len - subj->pos, &after_char); -	if (len == 0) { +	if (len == -1) {  		after_char = 10;  	} -	*can_open = numdelims > 0 && !utf8proc_is_space(after_char); -	*can_close = numdelims > 0 && !utf8proc_is_space(before_char); +	*can_open = numdelims > 0 && !utf8proc_is_space(after_char) && +		!(utf8proc_is_punctuation(after_char) && +		  !utf8proc_is_space(before_char) && +		  !utf8proc_is_punctuation(before_char)); +	*can_close = numdelims > 0 && !utf8proc_is_space(before_char) && +		!(utf8proc_is_punctuation(before_char) && +		  !utf8proc_is_space(after_char) && +		  !utf8proc_is_punctuation(after_char));  	if (c == '_') {  		*can_open = *can_open &&  			!(before_char < 128 && isalnum((char)before_char));  | 
