diff options
| -rw-r--r-- | api_test/main.c | 124 | ||||
| -rw-r--r-- | src/utf8.c | 65 | ||||
| -rw-r--r-- | src/utf8.h | 1 | 
3 files changed, 179 insertions, 11 deletions
diff --git a/api_test/main.c b/api_test/main.c index 06d9be2..2d65a46 100644 --- a/api_test/main.c +++ b/api_test/main.c @@ -8,6 +8,8 @@  #include "harness.h" +#define UTF8_REPL "\xEF\xBF\xBD" +  static const cmark_node_type node_types[] = {  	CMARK_NODE_DOCUMENT,  	CMARK_NODE_BLOCK_QUOTE, @@ -32,10 +34,25 @@ static const cmark_node_type node_types[] = {  static const int num_node_types = sizeof(node_types) / sizeof(*node_types);  static void +test_md_to_html(test_batch_runner *runner, const char *markdown, +		const char *expected_html, const char *msg); + +static void  test_content(test_batch_runner *runner, cmark_node_type type,  	     int allowed_content);  static void +test_char(test_batch_runner *runner, int valid, const char *utf8, +	  const char *msg); + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, +		     const char *msg); + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8); + +static void  constructor(test_batch_runner *runner)  {  	for (int i = 0; i < num_node_types; ++i) { @@ -436,13 +453,8 @@ test_content(test_batch_runner *runner, cmark_node_type type,  static void  parser(test_batch_runner *runner)  { -	static const char markdown[] = "No newline"; -	cmark_node *doc = cmark_parse_document(markdown, sizeof(markdown) - 1); -	char *html = cmark_render_html(doc); -	STR_EQ(runner, html, "<p>No newline</p>\n", -	       "document without trailing newline"); -	free(html); -	cmark_node_destroy(doc); +	test_md_to_html(runner, "No newline", "<p>No newline</p>\n", +			"document without trailing newline");  }  static void @@ -475,6 +487,103 @@ render_html(test_batch_runner *runner)  	cmark_node_destroy(doc);  } +static void +utf8(test_batch_runner *runner) +{ +	// Ranges +	test_char(runner, 1, "\x01", "valid utf8 01"); +	test_char(runner, 1, "\x7F", "valid utf8 7F"); +	test_char(runner, 0, "\x80", "invalid utf8 80"); +	test_char(runner, 0, "\xBF", "invalid utf8 BF"); +	test_char(runner, 0, "\xC0\x80", "invalid utf8 C080"); +	test_char(runner, 0, "\xC1\xBF", "invalid utf8 C1BF"); +	test_char(runner, 1, "\xC2\x80", "valid utf8 C280"); +	test_char(runner, 1, "\xDF\xBF", "valid utf8 DFBF"); +	test_char(runner, 0, "\xE0\x80\x80", "invalid utf8 E08080"); +	test_char(runner, 0, "\xE0\x9F\xBF", "invalid utf8 E09FBF"); +	test_char(runner, 1, "\xE0\xA0\x80", "valid utf8 E0A080"); +	test_char(runner, 1, "\xED\x9F\xBF", "valid utf8 ED9FBF"); +	test_char(runner, 0, "\xED\xA0\x80", "invalid utf8 EDA080"); +	test_char(runner, 0, "\xED\xBF\xBF", "invalid utf8 EDBFBF"); +	test_char(runner, 0, "\xF0\x80\x80\x80", "invalid utf8 F0808080"); +	test_char(runner, 0, "\xF0\x8F\xBF\xBF", "invalid utf8 F08FBFBF"); +	test_char(runner, 1, "\xF0\x90\x80\x80", "valid utf8 F0908080"); +	test_char(runner, 1, "\xF4\x8F\xBF\xBF", "valid utf8 F48FBFBF"); +	test_char(runner, 0, "\xF4\x90\x80\x80", "invalid utf8 F4908080"); +	test_char(runner, 0, "\xF7\xBF\xBF\xBF", "invalid utf8 F7BFBFBF"); +	test_char(runner, 0, "\xF8", "invalid utf8 F8"); +	test_char(runner, 0, "\xFF", "invalid utf8 FF"); + +	// Incomplete byte sequences at end of input +	test_incomplete_char(runner, "\xE0\xA0", "invalid utf8 E0A0"); +	test_incomplete_char(runner, "\xF0\x90\x80", "invalid utf8 F09080"); + +	// Invalid continuation bytes +	test_continuation_byte(runner, "\xC2\x80"); +	test_continuation_byte(runner, "\xE0\xA0\x80"); +	test_continuation_byte(runner, "\xF0\x90\x80\x80"); +} + +static void +test_char(test_batch_runner *runner, int valid, const char *utf8, +	  const char *msg) +{ +	char buf[20]; +	sprintf(buf, "((((%s))))", utf8); + +	if (valid) { +		char expected[30]; +		sprintf(expected, "<p>((((%s))))</p>\n", utf8); +		test_md_to_html(runner, buf, expected, msg); +	} +	else { +		test_md_to_html(runner, buf, "<p>((((" UTF8_REPL "))))</p>\n", +				msg); +	} +} + +static void +test_incomplete_char(test_batch_runner *runner, const char *utf8, +		     const char *msg) +{ +	char buf[20]; +	sprintf(buf, "----%s", utf8); +	test_md_to_html(runner, buf, "<p>----" UTF8_REPL "</p>\n", msg); +} + +static void +test_continuation_byte(test_batch_runner *runner, const char *utf8) +{ +	int len = strlen(utf8); + +	for (int pos = 1; pos < len; ++pos) { +		char buf[20]; +		sprintf(buf, "((((%s))))", utf8); +		buf[4+pos] = '\x20'; + +		char expected[50]; +		strcpy(expected, "<p>((((" UTF8_REPL "\x20"); +		for (int i = pos + 1; i < len; ++i) { +			strcat(expected, UTF8_REPL); +		} +		strcat(expected, "))))</p>\n"); + +		char *html = cmark_markdown_to_html(buf, strlen(buf)); +		STR_EQ(runner, html, expected, +		       "invalid utf8 continuation byte %d/%d", pos, len); +		free(html); +	} +} + +static void +test_md_to_html(test_batch_runner *runner, const char *markdown, +		const char *expected_html, const char *msg) +{ +	char *html = cmark_markdown_to_html(markdown, strlen(markdown)); +	STR_EQ(runner, html, expected_html, msg); +	free(html); +} +  int main() {  	int retval;  	test_batch_runner *runner = test_batch_runner_new(); @@ -486,6 +595,7 @@ int main() {  	hierarchy(runner);  	parser(runner);  	render_html(runner); +	utf8(runner);  	test_print_summary(runner);  	retval =  test_ok(runner) ? 0 : 1; @@ -28,7 +28,7 @@ static void encode_unknown(strbuf *buf)  	strbuf_put(buf, repl, 3);  } -int utf8proc_charlen(const uint8_t *str, int str_len) +static int utf8proc_charlen(const uint8_t *str, int str_len)  {  	int length, i; @@ -51,6 +51,64 @@ int utf8proc_charlen(const uint8_t *str, int str_len)  	return length;  } +// Validate a single UTF-8 character according to RFC 3629. +static int utf8proc_valid(const uint8_t *str, int str_len) +{ +	int length = utf8proc_charlen(str, str_len); + +	if (length <= 0) +		return length; + +	switch (length) { +	case 1: +		if (str[0] == 0x00) { +			// ASCII NUL is technically valid but rejected +			// for security reasons. +			return -length; +		} +		break; + +	case 2: +		if (str[0] < 0xC2) { +			// Overlong +			return -length; +		} +		break; + +	case 3: +		if (str[0] == 0xE0) { +			if (str[1] < 0xA0) { +				// Overlong +				return -length; +			} +		} +		else if (str[0] == 0xED) { +			if (str[1] >= 0xA0) { +				// Surrogate +				return -length; +			} +		} +		break; + +	case 4: +		if (str[0] == 0xF0) { +			if (str[1] < 0x90) { +				// Overlong +				return -length; +			} +		} +		else if (str[0] >= 0xF4) { +			if (str[0] > 0xF4 || str[1] >= 0x90) { +				// Above 0x10FFFF +				return -length; +			} +		} +		break; +	} + +	return length; +} +  void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)  {  	static const uint8_t whitespace[] = "    "; @@ -60,7 +118,8 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)  	while (i < size) {  		size_t org = i; -		while (i < size && line[i] != '\t' && line[i] < 0x80) { +		while (i < size && line[i] != '\t' && line[i] != '\0' +		       && line[i] < 0x80) {  			i++; tab++;  		} @@ -76,7 +135,7 @@ void utf8proc_detab(strbuf *ob, const uint8_t *line, size_t size)  			i += 1;  			tab += numspaces;  		} else { -			int charlen = utf8proc_charlen(line + i, size - i); +			int charlen = utf8proc_valid(line + i, size - i);  			if (charlen >= 0) {  				strbuf_put(ob, line + i, charlen); @@ -11,7 +11,6 @@ extern "C" {  void utf8proc_case_fold(cmark_strbuf *dest, const uint8_t *str, int len);  void utf8proc_encode_char(int32_t uc, cmark_strbuf *buf);  int utf8proc_iterate(const uint8_t *str, int str_len, int32_t *dst); -int utf8proc_charlen(const uint8_t *str, int str_len);  void utf8proc_detab(cmark_strbuf *dest, const uint8_t *line, size_t size);  int utf8proc_is_space(int32_t uc);  int utf8proc_is_punctuation(int32_t uc);  | 
