diff options
| author | Nick Wellnhofer <wellnhofer@aevum.de> | 2015-06-09 19:27:33 +0200 | 
|---|---|---|
| committer | Nick Wellnhofer <wellnhofer@aevum.de> | 2015-06-09 19:32:35 +0200 | 
| commit | 38f6ac470d3b597446d4663a00efbe6ebce8ee5e (patch) | |
| tree | 75af766e702d5899959b91ae7bd99e186e846283 /src | |
| parent | 8d997c85ee1452480ed3d821ce0642f7e6e5b9e6 (diff) | |
Further optimize utf8proc_valid
Assume a multi-byte sequence and rework switch statement into if/else
for another 2% speedup.
Diffstat (limited to 'src')
| -rw-r--r-- | src/utf8.c | 71 | 
1 file changed, 34 insertions, 37 deletions
| @@ -54,9 +54,11 @@ static int utf8proc_charlen(const uint8_t *str, bufsize_t str_len)  }  // Validate a single UTF-8 character according to RFC 3629. +// Assumes a multi-byte UTF-8 sequence.  static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)  {  	int length = utf8proc_utf8class[str[0]]; +	assert(length != 1);  	if (!length)  		return -1; @@ -64,53 +66,48 @@ static int utf8proc_valid(const uint8_t *str, bufsize_t str_len)  	if ((bufsize_t)length > str_len)  		return -str_len; -	switch (length) { -	case 2: -		if ((str[1] & 0xC0) != 0x80) -			return -1; +	if ((str[1] & 0xC0) != 0x80) +		return -1; + +	if (length == 2) {  		if (str[0] < 0xC2) {  			// Overlong  			return -length;  		} -		break; - -	case 3: -		if ((str[1] & 0xC0) != 0x80) -			return -1; +	} +	else {  		if ((str[2] & 0xC0) != 0x80)  			return -2; -		if (str[0] == 0xE0) { -			if (str[1] < 0xA0) { -				// Overlong -				return -length; -			} -		} else if (str[0] == 0xED) { -			if (str[1] >= 0xA0) { -				// Surrogate -				return -length; -			} -		} -		break; -	case 4: -		if ((str[1] & 0xC0) != 0x80) -			return -1; -		if ((str[2] & 0xC0) != 0x80) -			return -2; -		if ((str[3] & 0xC0) != 0x80) -			return -3; -		if (str[0] == 0xF0) { -			if (str[1] < 0x90) { -				// Overlong -				return -length; +		if (length == 3) { +			if (str[0] == 0xE0) { +				if (str[1] < 0xA0) { +					// Overlong +					return -length; +				} +			} else if (str[0] == 0xED) { +				if (str[1] >= 0xA0) { +					// Surrogate +					return -length; +				}  			} -		} else if (str[0] >= 0xF4) { -			if (str[0] > 0xF4 || str[1] >= 0x90) { -				// Above 0x10FFFF -				return -length; +		} +		else { +			if ((str[3] & 0xC0) != 0x80) +				return -3; + +			if (str[0] == 0xF0) { +				if (str[1] < 0x90) { +					// Overlong +					return -length; +				} +			} else if (str[0] >= 0xF4) { +				if (str[0] > 0xF4 || str[1] >= 0x90) { +					// Above 0x10FFFF +					return -length; +				}  			}  		} -		break;  	}  	return length; | 
