diff options
| -rw-r--r-- | src/CMakeLists.txt | 2 | ||||
| -rw-r--r-- | src/html.c | 114 | ||||
| -rw-r--r-- | src/smart.c | 146 | ||||
| -rw-r--r-- | src/smart.h | 28 | ||||
| -rw-r--r-- | test/smart_punct.txt | 4 | 
5 files changed, 182 insertions, 112 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2179c08..2150e7a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ set(HEADERS    html_unescape.h    houdini.h    cmark_ctype.h +  smart.h    )  set(LIBRARY_SOURCES    cmark.c @@ -36,6 +37,7 @@ set(LIBRARY_SOURCES    houdini_html_e.c    houdini_html_u.c    cmark_ctype.c +  smart.c    ${HEADERS}    ) @@ -9,6 +9,7 @@  #include "utf8.h"  #include "buffer.h"  #include "houdini.h" +#include "smart.h"  // Functions to convert cmark_nodes to HTML strings. @@ -61,10 +62,6 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,  	char start_header[] = "<h0";  	char end_header[] = "</h0";  	bool tight; -	int lastout, i; -	cmark_chunk lit; -	char before_char, after_char, c; -	bool left_flanking, right_flanking;  	bool entering = (ev_type == CMARK_EVENT_ENTER); @@ -223,112 +220,9 @@ S_render_node(cmark_node *node, cmark_event_type ev_type,  	case CMARK_NODE_TEXT:  		if (options & CMARK_OPT_SMARTPUNCT) { -			lastout = 0; -			i = 0; -			lit = node->as.literal; -			while (i < lit.len) { -				c = lit.data[i]; -				// replace with efficient lookup table: -				if (c != '"' && c != '-' && c != '\'' && c != '.') { -					i++; -					continue; -				} -				escape_html(html, lit.data + lastout, -				            i - lastout); -				if (c == '\'' || c == '"') { -					if (i == 0) { -						if (node->prev) { -							if (node->prev->type == CMARK_NODE_TEXT) { -								before_char = node->prev->as.literal.data[node->prev->as.literal.len - 1]; -							} else if (node->prev->type == CMARK_NODE_SOFTBREAK || -							           node->prev->type == CMARK_NODE_LINEBREAK) { -								before_char = '\n'; -							} else { -								before_char = 'x'; -							} -						} else { -							before_char = '\n'; -						} -					} else { -						before_char = lit.data[i - 1]; -					} -					if (i >= lit.len - 1) { -						if (node->next) { -							if (node->next->type == CMARK_NODE_TEXT) { -								after_char = node->next->as.literal.data[0]; -							} else if (node->next->type == CMARK_NODE_SOFTBREAK || -							           node->next->type == CMARK_NODE_LINEBREAK) { -								before_char = '\n'; -							} else { -								after_char = 'x'; -							} -						} else { -							after_char = '\n'; -						} -					} else { -						after_char = lit.data[i + 1]; -					} -					left_flanking = !utf8proc_is_space(after_char) && -					                !(utf8proc_is_punctuation(after_char) && -					                  !utf8proc_is_space(before_char) && -					                  !utf8proc_is_punctuation(before_char)); -					right_flanking = !utf8proc_is_space(before_char) && -					                 !(utf8proc_is_punctuation(before_char) && -					                   !utf8proc_is_space(after_char) && -					                   !utf8proc_is_punctuation(after_char)); -				} -				switch (lit.data[i]) { -				case '"': -					if (right_flanking) { -						cmark_strbuf_puts(html, "”"); -					} else { -						cmark_strbuf_puts(html, "“"); -					} -					i += 1; -					break; -				case '\'': -					if (left_flanking && !right_flanking) { -						cmark_strbuf_puts(html, "‘"); -					} else { -						cmark_strbuf_puts(html, "’"); -					} -					i += 1; -					break; -				case '-': -					if (i < lit.len - 1 && lit.data[i + 1] == '-') { -						if (lit.data[i + 2] == '-') { -							cmark_strbuf_puts(html, -							                  "—"); -							i += 3; -						} else { -							cmark_strbuf_puts(html, "–"); -							i += 2; -						} -					} else { -						cmark_strbuf_putc(html, c); -						i += 1; -					} -					break; -				case '.': -					if (i < lit.len - 2 && lit.data[i + 1] == '.' && -					    lit.data[i + 2] == '.') { -						cmark_strbuf_puts(html, -						                  "…"); -						i += 3; -					} else { -						cmark_strbuf_putc(html, c); -						i += 1; -					} -					break; -				default: -					cmark_strbuf_putc(html, c); -					i++; -				} -				lastout = i; -			} -			escape_html(html, node->as.literal.data + lastout, -			            i - lastout); - +			escape_with_smart(html, node, escape_html, +					  "“", "”", "‘", "’", +					  "—", "–", "…");  		} else {  			escape_html(html, node->as.literal.data,  			            node->as.literal.len); diff --git a/src/smart.c b/src/smart.c new file mode 100644 index 0000000..54c9740 --- /dev/null +++ b/src/smart.c @@ -0,0 +1,146 @@ +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> + +#include "config.h" +#include "cmark.h" +#include "node.h" +#include "utf8.h" +#include "buffer.h" +#include "chunk.h" + +void escape_with_smart(cmark_strbuf *buf, +		       cmark_node *node, +		       void (*escape)(cmark_strbuf *, const unsigned char *, int), +		       const char *left_double_quote, +		       const char *right_double_quote, +		       const char *left_single_quote, +		       const char *right_single_quote, +		       const char *em_dash, +		       const char *en_dash, +		       const char *ellipses) +{ +	int32_t c = 0; +	int32_t after_char = 0; +	int32_t before_char = 0; +	int len; +	bool left_flanking, right_flanking; +	int lastout = 0; +	int i = 0; +	cmark_chunk lit = node->as.literal; + +	// set before_char based on previous text node if there is one: +	if (node->prev) { +		if (node->prev->type == CMARK_NODE_TEXT) { + +			// walk back to the beginning of the UTF_8 sequence: +			i = node->prev->as.literal.len - 1; +			while (i > 0 && node->prev->as.literal.data[i] >> 6 == 2) { +				i -= 1; +			} +			len = utf8proc_iterate(node->prev->as.literal.data + i, +					       node->prev->as.literal.len - i, +					       &before_char); +			if (len == -1) { +				before_char = 10; +			} + +		} else if (node->prev->type == CMARK_NODE_SOFTBREAK || +			   node->prev->type == CMARK_NODE_LINEBREAK) { +			before_char = 10; + +		} else { +			before_char = 65; +		} +	} else { +		before_char = 10; +	} + +	while (i < lit.len) { +		len = utf8proc_iterate(lit.data + i, lit.len - i, &c); +		i += len; + +		// replace with efficient lookup table: +		if (!(c == 34 || c == 39 || c == 45 || c == 46)) { +			before_char = c; +			continue; +		} +		(*escape)(buf, lit.data + lastout, i - len - lastout); + +		if (c == 34 || c == 39) { + +			if (i >= lit.len) { +				if (node->next) { +					if (node->next->type == CMARK_NODE_TEXT) { +						utf8proc_iterate(node->next->as.literal.data, +								 node->next->as.literal.len, +								 &after_char); +					} else if (node->next->type == CMARK_NODE_SOFTBREAK || +						   node->next->type == CMARK_NODE_LINEBREAK) { +						after_char = 10; +					} else { +						after_char = 65; +					} +				} else { +					after_char = 10; +				} +			} else { +				utf8proc_iterate(lit.data + i, lit.len - i, &after_char); +			} + +			left_flanking = !utf8proc_is_space(after_char) && +				!(utf8proc_is_punctuation(after_char) && +				  !utf8proc_is_space(before_char) && +				  !utf8proc_is_punctuation(before_char)); +			right_flanking = !utf8proc_is_space(before_char) && +				!(utf8proc_is_punctuation(before_char) && +				  !utf8proc_is_space(after_char) && +				  !utf8proc_is_punctuation(after_char)); +		} + +		switch (c) { +		case 34: // " +			if (right_flanking) { +				cmark_strbuf_puts(buf, right_double_quote); +			} else { +				cmark_strbuf_puts(buf, left_double_quote); +			} +			break; +		case 39: // ' +			if (left_flanking && !right_flanking) { +				cmark_strbuf_puts(buf, left_single_quote); +			} else { +				cmark_strbuf_puts(buf, right_single_quote); +			} +			break; +		case 45: // - +			if (i < lit.len && lit.data[i] == '-') { +				if (lit.data[i + 1] == '-') { +					cmark_strbuf_puts(buf, em_dash); +					i += 2; +				} else { +					cmark_strbuf_puts(buf, en_dash); +					i += 1; +				} +			} else { +				cmark_strbuf_putc(buf, c); +			} +			break; +		case 46: // . +			if (i < lit.len - 1 && lit.data[i] == '.' && +			    lit.data[i + 1] == '.') { +				cmark_strbuf_puts(buf, ellipses); +				i += 2; +			} else { +				cmark_strbuf_putc(buf, c); +			} +			break; +		default: +			cmark_strbuf_putc(buf, c); +		} +		lastout = i; +	} +	(*escape)(buf, node->as.literal.data + lastout, lit.len - lastout); + +} diff --git a/src/smart.h b/src/smart.h new file mode 100644 index 0000000..fa614b3 --- /dev/null +++ b/src/smart.h @@ -0,0 +1,28 @@ +#ifndef CMARK_SMART_H +#define CMARK_SMART_H + +#include <stddef.h> +#include <stdarg.h> +#include "config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void escape_with_smart(cmark_strbuf *buf, +		       cmark_node *node, +		       void (*escape)(cmark_strbuf *, const unsigned char *, int), +		       const char *left_double_quote, +		       const char *right_double_quote, +		       const char *left_single_quote, +		       const char *right_single_quote, +		       const char *em_dash, +		       const char *en_dash, +		       const char *ellipses); + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/test/smart_punct.txt b/test/smart_punct.txt index c036a6d..c870c9d 100644 --- a/test/smart_punct.txt +++ b/test/smart_punct.txt @@ -35,9 +35,9 @@ Were you alive in the 70's?  .  . -Here is some quoted '`code`' and a "[quoted link][1]". +Here is some quoted '`code`' and a "[quoted link](url)".  . -<p>Here is some quoted ‘<code>code</code>’ and a “[quoted link][1]”.</p> +<p>Here is some quoted ‘<code>code</code>’ and a “<a href="url">quoted link</a>”.</p>  .  .  | 
