Index: RELEASE-NOTES
===================================================================
RCS file: /cvsroot/wikipedia/phase3/RELEASE-NOTES,v
retrieving revision 1.348
diff -u -r1.348 RELEASE-NOTES
--- RELEASE-NOTES 28 Jul 2005 07:47:49 -0000 1.348
+++ RELEASE-NOTES 29 Jul 2005 09:58:42 -0000
@@ -614,6 +614,8 @@
* Fix typo in undefined array index access prevention
* (bug 2947) Update namespaces for sr localization
* (bug 2952) Added Asturian language file with translated namespaces
+* (bug 2676) Apply a protective transformation on editing input/output
+ for browsers that hit the Unicode blacklist. Patch by plugwash.
=== Caveats ===
Index: includes/EditPage.php
===================================================================
RCS file: /cvsroot/wikipedia/phase3/includes/EditPage.php,v
retrieving revision 1.196
diff -u -r1.196 EditPage.php
--- includes/EditPage.php 25 Jul 2005 07:00:20 -0000 1.196
+++ includes/EditPage.php 29 Jul 2005 09:58:43 -0000
@@ -211,8 +211,8 @@
# These fields need to be checked for encoding.
# Also remove trailing whitespace, but don't remove _initial_
# whitespace from the text boxes. This may be significant formatting.
- $this->textbox1 = rtrim( $request->getText( 'wpTextbox1' ) );
- $this->textbox2 = rtrim( $request->getText( 'wpTextbox2' ) );
+ $this->textbox1 = $this->safeUnicodeInput( $request, 'wpTextbox1' );
+ $this->textbox2 = $this->safeUnicodeInput( $request, 'wpTextbox2' );
$this->mMetaData = rtrim( $request->getText( 'metadata' ) );
$this->summary = $request->getText( 'wpSummary' );
@@ -699,6 +699,9 @@
}
else $metadata = "" ;
+ $safemodehtml = $this->checkUnicodeCompliantBrowser()
+ ? ""
+ : "<input type='hidden' name=\"safemode\" value='1' />\n";
$wgOut->addHTML( <<
END
-. htmlspecialchars( $wgContLang->recodeForEdit( $this->textbox1 ) ) .
+. htmlspecialchars( $this->safeUnicodeOutput( $this->textbox1 ) ) .
"
{$metadata}
{$editsummary}
{$checkboxhtml}
+{$safemodehtml}
addWikiText( '==' . wfMsg( "yourtext" ) . '==' );
$wgOut->addHTML( "" );
}
@@ -1162,6 +1166,122 @@
return '' . $difftext . '
';
}
+	/**
+	 * Read a text field from the request with trailing whitespace stripped.
+	 * When the request carries the 'safemode' flag, the text is de-armored
+	 * with unmakesafe() before being returned.
+	 *
+	 * @param WebRequest $request
+	 * @param string $field
+	 * @return string
+	 * @access private
+	 */
+	function safeUnicodeInput( $request, $field ) {
+		$submitted = rtrim( $request->getText( $field ) );
+		if( !$request->getBool( 'safemode' ) ) return $submitted;
+		return $this->unmakesafe( $submitted );
+	}
+
+	/**
+	 * Prepare text for the edit box. The text is first recoded for editing
+	 * by the content language; if the browser fails the Unicode compliance
+	 * check, the recoded text is additionally armored with makesafe().
+	 *
+	 * @param string $text
+	 * @return string
+	 * @access private
+	 */
+	function safeUnicodeOutput( $text ) {
+		global $wgContLang;
+		$recoded = $wgContLang->recodeForEdit( $text );
+		if( $this->checkUnicodeCompliantBrowser() ) return $recoded;
+		return $this->makesafe( $recoded );
+	}
+
+	/**
+	 * A number of web browsers are known to corrupt non-ASCII characters
+	 * in a UTF-8 text editing environment. To protect against this,
+	 * detected browsers are served an armored version of the text, with
+	 * non-ASCII characters converted to hex character references (&#xHHHH;).
+	 *
+	 * Any "&#x" already present in the text has a 0 inserted after it
+	 * ("&#x0") first, so that round-trips do not alter the original data.
+	 *
+	 * @param string $invalue UTF-8 text to armor
+	 * @return string armored text; ASCII bytes are passed through unchanged
+	 * @access private
+	 */
+	function makesafe( $invalue ) {
+		// Armor existing references for reversibility.
+		$invalue = strtr( $invalue, array( "&#x" => "&#x0" ) );
+
+		$bytesleft = 0;
+		$result = "";
+		$working = 0;
+		for( $i = 0, $length = strlen( $invalue ); $i < $length; $i++ ) {
+			$bytevalue = ord( $invalue[$i] );
+			if( $bytevalue <= 0x7F ) { //0xxx xxxx: ASCII, copied as-is
+				$result .= chr( $bytevalue );
+				$bytesleft = 0;
+			} elseif( $bytevalue <= 0xBF ) { //10xx xxxx: continuation byte
+				$working = $working << 6;
+				$working += ($bytevalue & 0x3F);
+				$bytesleft--;
+				if( $bytesleft <= 0 ) {
+					$result .= "&#x" . strtoupper( dechex( $working ) ) . ";";
+				}
+			} elseif( $bytevalue <= 0xDF ) { //110x xxxx: lead of 2-byte sequence
+				$working = $bytevalue & 0x1F;
+				$bytesleft = 1;
+			} elseif( $bytevalue <= 0xEF ) { //1110 xxxx: lead of 3-byte sequence
+				$working = $bytevalue & 0x0F;
+				$bytesleft = 2;
+			} else { //1111 0xxx: lead of 4-byte sequence
+				$working = $bytevalue & 0x07;
+				$bytesleft = 3;
+			}
+		}
+		return $result;
+	}
+
+	/**
+	 * Reverse makesafe(): "&#x" references whose first digit is not 0 are
+	 * decoded to UTF-8, then "&#x0" is restored to "&#x". Protects data from
+	 * corruption by broken web browsers as listed in $wgBrowserBlackList.
+	 *
+	 * @param string $invalue armored text
+	 * @return string
+	 * @access private
+	 */
+	function unmakesafe( $invalue ) {
+		$result = "";
+		for( $i = 0, $length = strlen( $invalue ); $i < $length; $i++ ) {
+			if( ( substr( $invalue, $i, 3 ) == "&#x" ) && ( $i + 3 < $length ) && ( $invalue[$i+3] !== '0' ) ) {
+				$i += 3;
+				$hexstring = "";
+				do {
+					$hexstring .= $invalue[$i];
+					$i++;
+				} while( ( $i < $length ) && ctype_xdigit( $invalue[$i] ) );
+
+				// Sanity checks: not needed for reversibility, but they keep the
+				// breakage down if the editor damages a reference whilst editing.
+				// The first digit was consumed unchecked, so validate the whole run.
+				if ((substr($invalue,$i,1)==";") and (strlen($hexstring) <= 6) and ctype_xdigit($hexstring)) {
+					$codepoint = hexdec($hexstring);
+					$result .= codepointToUtf8( $codepoint );
+				} else {
+					$result .= "&#x" . $hexstring . substr( $invalue, $i, 1 );
+				}
+			} else {
+				$result .= substr( $invalue, $i, 1 );
+			}
+		}
+		// Reverse the transform that makesafe() applied for reversibility reasons.
+		return strtr( $result, array( "&#x0" => "&#x" ) );
+	}
+
+
}
?>
Index: languages/Language.php
===================================================================
RCS file: /cvsroot/wikipedia/phase3/languages/Language.php,v
retrieving revision 1.630
diff -u -r1.630 Language.php
--- languages/Language.php 26 Jul 2005 04:23:22 -0000 1.630
+++ languages/Language.php 29 Jul 2005 09:58:44 -0000
@@ -730,7 +730,7 @@
press \"Save page\".
",
'yourtext' => 'Your text',
'storedversion' => 'Stored version',
-'nonunicodebrowser' => "WARNING: Your browser is not unicode compliant, please change it before editing an article.",
+'nonunicodebrowser' => "WARNING: Your browser is not unicode compliant. A workaround is in place to allow you to safely edit articles: non-ASCII characters will appear in the edit box as hexadecimal codes.",
'editingold' => "WARNING: You are editing an out-of-date
revision of this page.
If you save it, any changes made since this revision will be lost.",