Index: RELEASE-NOTES =================================================================== RCS file: /cvsroot/wikipedia/phase3/RELEASE-NOTES,v retrieving revision 1.348 diff -u -r1.348 RELEASE-NOTES --- RELEASE-NOTES 28 Jul 2005 07:47:49 -0000 1.348 +++ RELEASE-NOTES 29 Jul 2005 09:58:42 -0000 @@ -614,6 +614,8 @@ * Fix typo in undefined array index access prevention * (bug 2947) Update namespaces for sr localization * (bug 2952) Added Asturian language file with translated namespaces +* (bug 2676) Apply a protective transformation on editing input/output + for browsers that hit the Unicode blacklist. Patch by plugwash. === Caveats === Index: includes/EditPage.php =================================================================== RCS file: /cvsroot/wikipedia/phase3/includes/EditPage.php,v retrieving revision 1.196 diff -u -r1.196 EditPage.php --- includes/EditPage.php 25 Jul 2005 07:00:20 -0000 1.196 +++ includes/EditPage.php 29 Jul 2005 09:58:43 -0000 @@ -211,8 +211,8 @@ # These fields need to be checked for encoding. # Also remove trailing whitespace, but don't remove _initial_ # whitespace from the text boxes. This may be significant formatting. - $this->textbox1 = rtrim( $request->getText( 'wpTextbox1' ) ); - $this->textbox2 = rtrim( $request->getText( 'wpTextbox2' ) ); + $this->textbox1 = $this->safeUnicodeInput( $request, 'wpTextbox1' ); + $this->textbox2 = $this->safeUnicodeInput( $request, 'wpTextbox2' ); $this->mMetaData = rtrim( $request->getText( 'metadata' ) ); $this->summary = $request->getText( 'wpSummary' ); @@ -699,6 +699,9 @@ } else $metadata = "" ; + $safemodehtml = $this->checkUnicodeCompliantBrowser() + ? "" + : "\n"; $wgOut->addHTML( << END -. htmlspecialchars( $wgContLang->recodeForEdit( $this->textbox1 ) ) . +. htmlspecialchars( $this->safeUnicodeOutput( $this->textbox1 ) ) . " {$metadata}
{$editsummary} {$checkboxhtml} +{$safemodehtml} addWikiText( '==' . wfMsg( "yourtext" ) . '==' ); $wgOut->addHTML( "" ); } @@ -1162,6 +1166,122 @@ return '
' . $difftext . '
'; }
+	/**
+	 * Filter an input field through a Unicode de-armoring process if it
+	 * came from an old browser with known broken Unicode editing issues.
+	 *
+	 * @param WebRequest $request
+	 * @param string $field
+	 * @return string
+	 * @access private
+	 */
+	function safeUnicodeInput( $request, $field ) {
+		$text = rtrim( $request->getText( $field ) );
+		return $request->getBool( 'safemode' )
+			? $this->unmakesafe( $text )
+			: $text;
+	}
+
+	/**
+	 * Filter an output field through a Unicode armoring process if it
+	 * is going to an old browser with known broken Unicode editing issues.
+	 *
+	 * @param string $text
+	 * @return string
+	 * @access private
+	 */
+	function safeUnicodeOutput( $text ) {
+		global $wgContLang;
+		$codedText = $wgContLang->recodeForEdit( $text );
+		return $this->checkUnicodeCompliantBrowser()
+			? $codedText
+			: $this->makesafe( $codedText );
+	}
+
+	/**
+	 * A number of web browsers are known to corrupt non-ASCII characters
+	 * in a UTF-8 text editing environment. To protect against this,
+	 * detected browsers will be served an armored version of the text,
+	 * with non-ASCII chars converted to numeric HTML character references.
+	 *
+	 * Preexisting such character references will have a 0 added to them
+	 * to ensure that round-trips do not alter the original data.
+	 *
+	 * @param string $invalue
+	 * @return string
+	 * @access private
+	 */
+	function makesafe( $invalue ) {
+		// Armor existing references ("&#x" becomes "&#x0") for reversibility.
+		$invalue = strtr( $invalue, array( "&#x" => "&#x0" ) );
+
+		$bytesleft = 0;
+		$result = "";
+		$working = 0;
+		for( $i = 0; $i < strlen( $invalue ); $i++ ) {
+			$bytevalue = ord( $invalue[$i] );
+			if( $bytevalue <= 0x7F ) { //0xxx xxxx
+				$result .= chr( $bytevalue );
+				$bytesleft = 0;
+			} elseif( $bytevalue <= 0xBF ) { //10xx xxxx
+				$working = $working << 6;
+				$working += ($bytevalue & 0x3F);
+				$bytesleft--;
+				if( $bytesleft <= 0 ) { // last byte of the sequence: emit the reference
+					$result .= "&#x" . strtoupper( dechex( $working ) ) . ";";
+				}
+			} elseif( $bytevalue <= 0xDF ) { //110x xxxx
+				$working = $bytevalue & 0x1F;
+				$bytesleft = 1;
+			} elseif( $bytevalue <= 0xEF ) { //1110 xxxx
+				$working = $bytevalue & 0x0F;
+				$bytesleft = 2;
+			} else { //1111 0xxx
+				$working = $bytevalue & 0x07;
+				$bytesleft = 3;
+			}
+		}
+		return $result;
+	}
+
+	/**
+	 * Reverse the previously applied transliteration of non-ASCII characters
+	 * back to UTF-8. Used to protect data from corruption by broken web browsers
+	 * as listed in $wgBrowserBlackList.
+	 *
+	 * @param string $invalue
+	 * @return string
+	 * @access private
+	 */
+	function unmakesafe( $invalue ) {
+		$result = "";
+		for( $i = 0; $i < strlen( $invalue ); $i++ ) {
+			if( ( substr( $invalue, $i, 3 ) == "&#x" ) && ( substr( $invalue, $i + 3, 1 ) !== '0' ) ) {
+				$i += 3;
+				$hexstring = "";
+				while( ( $i < strlen( $invalue ) ) && ctype_xdigit( $invalue[$i] ) ) { // bounds check first
+					$hexstring .= $invalue[$i];
+					$i++;
+				}
+
+				// Do some sanity checks. These aren't needed for reversibility,
+				// but should help keep the breakage down if the editor
+				// breaks one of the entities whilst editing.
+				if ( ( substr( $invalue, $i, 1 ) == ";" ) && ( strlen( $hexstring ) >= 1 ) && ( strlen( $hexstring ) <= 6 ) ) {
+					$codepoint = hexdec($hexstring);
+					$result .= codepointToUtf8( $codepoint );
+				} else {
+					$result .= "&#x" . $hexstring . substr( $invalue, $i, 1 );
+				}
+			} else {
+				$result .= substr( $invalue, $i, 1 );
+			}
+		}
+		// Reverse the "&#x" to "&#x0" armoring that makesafe() applied for reversibility.
+		return strtr( $result, array( "&#x0" => "&#x" ) );
+	}
+
+
 }
 ?>
Index: languages/Language.php
===================================================================
RCS file: /cvsroot/wikipedia/phase3/languages/Language.php,v
retrieving revision 1.630
diff -u -r1.630 Language.php
--- languages/Language.php 26 Jul 2005 04:23:22 -0000 1.630
+++ languages/Language.php 29 Jul 2005 09:58:44 -0000
@@ -730,7 +730,7 @@
 press \"Save page\".
", 'yourtext' => 'Your text', 'storedversion' => 'Stored version', -'nonunicodebrowser' => "WARNING: Your browser is not unicode compliant, please change it before editing an article.", +'nonunicodebrowser' => "WARNING: Your browser is not unicode compliant. A workaround is in place to allow you to safely edit articles: non-ASCII characters will appear in the edit box as hexadecimal codes.", 'editingold' => "WARNING: You are editing an out-of-date revision of this page. If you save it, any changes made since this revision will be lost.",