From 840a28c7044b3a2d4e5fdfa31858599d04065783 Mon Sep 17 00:00:00 2001
From: Kevin Israel <pleasestand@live.com>
Date: Fri, 11 Oct 2013 22:13:06 -0400
Subject: [PATCH] SECURITY: UtfNormal: add remaining noncharacters

It may be wise to replace all noncharacters with U+FFFD, not just
U+FFFE and U+FFFF: they can trigger errors in libraries (e.g. PCRE
8.32) and/or other applications, and those errors might not be
properly handled.

Bug: 55548
Change-Id: I63ca3217a882fb4b156e0706285b1216319906f0
---
 includes/normal/RandomTest.php | 15 ++++++++++++---
 includes/normal/Utf8Test.php   |  4 ++--
 includes/normal/UtfNormal.php  | 43 ++++++++++++++++++++++++++++--------------
 3 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/includes/normal/RandomTest.php b/includes/normal/RandomTest.php
index 0602986..c1ed0b9 100644
--- a/includes/normal/RandomTest.php
+++ b/includes/normal/RandomTest.php
@@ -48,9 +48,18 @@ function randomString( $length, $nullOk, $ascii = false ) {
 /* Duplicate of the cleanUp() path for ICU usage */
 function donorm( $str ) {
 	# We exclude a few chars that ICU would not.
-	$str = preg_replace( '/[\x00-\x08\x0b\x0c\x0e-\x1f]/', UTF8_REPLACEMENT, $str );
-	$str = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $str );
-	$str = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $str );
+	$str = preg_replace( '/
+
+		# Control characters illegal in XML
+		[\x00-\x08\x0b\x0c\x0e-\x1f] |
+
+		# U+FDD0..U+FDEF, U+FFFE, U+FFFF
+		\xef(?:\xb7[\x90-\xaf]|\xbf[\xbe\xbf]) |
+
+		# U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, ..., U+10FFFE, U+10FFFF
+		[\xf0-\xf4][\x8f\x9f\xaf\xbf]\xbf[\xbe\xbf]
+
+	/Sx', UTF8_REPLACEMENT, $str );
 
 	# UnicodeString constructor fails if the string ends with a head byte.
 	# Add a junk char at the end, we'll strip it off
diff --git a/includes/normal/Utf8Test.php b/includes/normal/Utf8Test.php
index c5c1be5..734ff7d 100644
--- a/includes/normal/Utf8Test.php
+++ b/includes/normal/Utf8Test.php
@@ -74,8 +74,8 @@ $exceptions = array(
 	# sequences beyond what is now considered legal.
 	'2.1.5', '2.1.6', '2.2.4', '2.2.5', '2.2.6', '2.3.5',
 
-	# Literal 0xffff, which is illegal
-	'2.2.3' );
+	# Noncharacters, which we have chosen to replace with U+FFFD
+	'2.2.3', '2.3.4' );
 
 $longTests = array(
 	# These tests span multiple lines
diff --git a/includes/normal/UtfNormal.php b/includes/normal/UtfNormal.php
index 5a091af..6809bb8 100644
--- a/includes/normal/UtfNormal.php
+++ b/includes/normal/UtfNormal.php
@@ -382,14 +382,24 @@ class UtfNormal {
 								|| ($n == 0xe0 && $sequence <= UTF8_OVERLONG_B)
 								|| ($n == 0xf0 && $sequence <= UTF8_OVERLONG_C)
 
-								# U+FFFE and U+FFFF are explicitly forbidden in Unicode.
-								|| ($n == 0xef &&
-									   ($sequence == UTF8_FFFE)
-									|| ($sequence == UTF8_FFFF) )
+								# Noncharacters are reserved for internal use. The Unicode Standard
+								# doesn't require that we do anything with them, though we replace
+								# U+FFFE and U+FFFF because they are forbidden in XML, and the rest
+								# because some libraries have trouble with them too (e.g. PCRE 8.32).
+								#
+								# Also, Unicode has been limited to 21 bits; longer sequences
+								# (those greater than UTF8_MAX) are not allowed.
 
-								# Unicode has been limited to 21 bits; longer
-								# sequences are not allowed.
-								|| ($n >= 0xf0 && $sequence > UTF8_MAX) ) {
+								|| ($n == 0xef && (
+									   ($sequence >= UTF8_FDD0 && $sequence <= UTF8_FDEF)
+									|| ($sequence == UTF8_FFFE)
+									|| ($sequence == UTF8_FFFF) ) )
+
+								|| ($n >= 0xf0 && (
+									   ($sequence & "\xf0\x8f\xbf\xbe") == "\xf0\x8f\xbf\xbe"
+									|| ($sequence > UTF8_MAX) ) )
+
+								) {
 
 								$replace[] = array( UTF8_REPLACEMENT,
 								                    $base + $i + 1 - strlen( $sequence ),
@@ -766,12 +776,17 @@ class UtfNormal {
 	 * @return String String with the character codes replaced.
 	 */
 	private static function replaceForNativeNormalize( $string ) {
-		$string = preg_replace(
-			'/[\x00-\x08\x0b\x0c\x0e-\x1f]/',
-			UTF8_REPLACEMENT,
-			$string );
-		$string = str_replace( UTF8_FFFE, UTF8_REPLACEMENT, $string );
-		$string = str_replace( UTF8_FFFF, UTF8_REPLACEMENT, $string );
-		return $string;
+		return preg_replace( '/
+
+			# Control characters illegal in XML
+			[\x00-\x08\x0b\x0c\x0e-\x1f] |
+
+			# U+FDD0..U+FDEF, U+FFFE, U+FFFF
+			\xef(?:\xb7[\x90-\xaf]|\xbf[\xbe\xbf]) |
+
+			# U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, ..., U+10FFFE, U+10FFFF
+			[\xf0-\xf4][\x8f\x9f\xaf\xbf]\xbf[\xbe\xbf]
+
+		/Sx', UTF8_REPLACEMENT, $string );
 	}
 }
-- 
1.8.4.2

