From b2bf756c1483c0dc65110bdd1b63710e9cdd6edc Mon Sep 17 00:00:00 2001 From: csteipp Date: Wed, 5 Nov 2014 15:42:20 -0800 Subject: [PATCH] SECURITY: Properly remove html from conversion text When converting a text to a variant, html should not be converted. This patch parses the text as html5, and protects and html from translation. Change-Id: I268fdb9be3c9f7f020aab3a0200db6b7a0beddaa --- includes/AutoLoader.php | 1 + includes/Html5Tokenizer.php | 1364 +++++++++++++++++++++++++ languages/LanguageConverter.php | 89 +- tests/phpunit/includes/Html5TokenizerTest.php | 132 +++ 4 files changed, 1559 insertions(+), 27 deletions(-) create mode 100644 includes/Html5Tokenizer.php create mode 100644 tests/phpunit/includes/Html5TokenizerTest.php diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php index 172bd49..460730a 100644 --- a/includes/AutoLoader.php +++ b/includes/AutoLoader.php @@ -82,6 +82,7 @@ $wgAutoloadLocalClasses = array( 'HistoryBlobStub' => 'includes/HistoryBlob.php', 'Hooks' => 'includes/Hooks.php', 'Html' => 'includes/Html.php', + 'Html5Tokenizer' => 'includes/Html5Tokenizer.php', 'HtmlFormatter' => 'includes/HtmlFormatter.php', 'HTMLApiField' => 'includes/htmlform/HTMLApiField.php', 'HTMLAutoCompleteSelectField' => 'includes/htmlform/HTMLAutoCompleteSelectField.php', diff --git a/includes/Html5Tokenizer.php b/includes/Html5Tokenizer.php new file mode 100644 index 0000000..e551c44 --- /dev/null +++ b/includes/Html5Tokenizer.php @@ -0,0 +1,1364 @@ + + * Copyright 2008 Edward Z. 
Yang
 * Copyright 2009 Geoffrey Sneddon
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * @file
 */

// In general:
// State names and transitions follow the tokenization chapter of the
// HTML 5 specification draft this tokenizer was written against. The
// comments inside parse() paraphrase the relevant rule for each branch;
// MediaWiki-specific deviations are called out explicitly.

/**
 * Locates successive chunks of HTML markup (tags, comments, doctype
 * declarations) inside a string.
 *
 * Each call to parse() scans forward from the current stream position and,
 * when it finds markup, stores the raw text in $element and its start offset
 * in $elementStart. Only the PCDATA content model is handled.
 */
class Html5Tokenizer {

	/**
	 * HTML5_InputStream the stream we parse to find each chunk of html
	 */
	private $stream;

	/**
	 * index into the data where the next html chunk starts
	 */
	public $elementStart;

	/**
	 * The next html chunk
	 */
	public $element;

	/**
	 * Array of elements where we return the entire
	 * content. Usually array( 'pre', 'code', 'style', 'script' );
	 */
	private $entireElements;

	/**
	 * Do we return /^[^<]*>/ as an html token?
	 */
	private $flagCloseOnly;

	/**
	 * Do we return /<[^>]*$/ as an html token?
	 */
	private $flagOpenOnly;

	/**
	 * Current token that is being built, but not yet emitted. Also
	 * is the last token emitted, if applicable.
	 */
	protected $token;

	// These are constants describing tokens
	const DOCTYPE = 0;
	const STARTTAG = 1;
	const ENDTAG = 2;
	const COMMENT = 3;
	const CHARACTER = 4;
	const SPACECHARACTER = 5;
	const EOF = 6;
	const PARSEERROR = 7;

	// These are constants representing bunches of characters.
	const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
	const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
	const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
	const DIGIT = '0123456789';
	const HEX = '0123456789ABCDEFabcdef';
	const WHITESPACE = "\t\n\x0c ";

	/**
	 * @param string $data the html string to tokenize
	 * @param array $entireElements list of element names to get the entire contents of
	 */
	public function __construct( $data, array $entireElements = array() ) {
		$this->stream = new HTML5_InputStream( $data );
		$this->entireElements = $entireElements;
		$this->flagCloseOnly = false;
		$this->flagOpenOnly = false;
	}

	/**
	 * @param bool $flag whether to match ^[^<]*>
	 */
	public function setFlagCloseOnly( $flag ) {
		$this->flagCloseOnly = $flag;
	}

	/**
	 * @param bool $flag whether to match <[^>]*$
	 */
	public function setFlagOpenOnly( $flag ) {
		$this->flagOpenOnly = $flag;
	}

	/**
	 * @return array|null the token most recently built by parse(), or null
	 *   if no token has been started yet
	 */
	public function getLastToken() {
		return $this->token;
	}

	/**
	 * Decide whether the token that was just completed may end the chunk.
	 *
	 * When we are not swallowing an entire element, any completed token ends
	 * the chunk. While swallowing an entire element, only an end tag or a
	 * self-closing tag with that element's name does.
	 *
	 * @param string|bool $matchingEntireElement name of the element whose whole
	 *   body is being matched, or false when none is
	 * @return bool
	 */
	public function checkEntireElementMatching( $matchingEntireElement ) {
		if ( !$matchingEntireElement ) {
			return true;
		}

		// Comment tokens carry no 'name', and the token may not exist at all
		// if this public method is called before parse(); guard every read.
		$selfClosing = isset( $this->token['self-closing'] )
			? $this->token['self-closing']
			: false;
		$isEndTag = isset( $this->token['type'] )
			&& $this->token['type'] === self::ENDTAG;

		return ( $isEndTag || $selfClosing )
			&& isset( $this->token['name'] )
			&& $this->token['name'] === $matchingEntireElement;
	}

	/**
	 * @param string|bool $char a character from the stream, or false at EOF
	 * @return bool whether $char is TAB, LF, FF or SPACE (see self::WHITESPACE)
	 */
	private static function isWhitespace( $char ) {
		return $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ';
	}

	/**
	 * @param string|bool $char a character from the stream, or false at EOF
	 * @return bool whether $char is in the range A-Z
	 */
	private static function isUpperAlpha( $char ) {
		return 'A' <= $char && $char <= 'Z';
	}

	/**
	 * @param string|bool $char a character from the stream, or false at EOF
	 * @return bool whether $char is in the range a-z
	 */
	private static function isLowerAlpha( $char ) {
		return 'a' <= $char && $char <= 'z';
	}

	/**
	 * Store everything from $elementStart up to the current stream position
	 * in $element.
	 */
	private function captureElement() {
		$this->element = $this->stream->getSubstr(
			$this->elementStart,
			$this->stream->getPos()
		);
	}

	/**
	 * Capture the current chunk if the just-completed token is allowed to end it.
	 *
	 * @param string|bool $matchingEntireElement see checkEntireElementMatching()
	 * @return bool true when $element was set and parse() should return
	 */
	private function emitElement( $matchingEntireElement ) {
		if ( !$this->checkEntireElementMatching( $matchingEntireElement ) ) {
			return false;
		}
		$this->captureElement();
		return true;
	}

	/**
	 * Start a new attribute, with an empty value, on the current tag token.
	 *
	 * @param string $name first character of the attribute name
	 */
	private function startAttribute( $name ) {
		$this->token['attr'][] = array(
			'name' => $name,
			'value' => ''
		);
	}

	/**
	 * Append text to the name or value of the current tag token's last attribute.
	 *
	 * @param string $field 'name' or 'value'
	 * @param string $text text to append
	 */
	private function appendToLastAttribute( $field, $text ) {
		// The !DOCTYPE pseudo-tag enters the attribute states without ever
		// passing through "before attribute name", so there may be no
		// attribute to append to yet.
		if ( empty( $this->token['attr'] ) ) {
			$this->token['attr'] = array( array( 'name' => '', 'value' => '' ) );
		}
		$last = count( $this->token['attr'] ) - 1;
		$this->token['attr'][$last][$field] .= $text;
	}

	/**
	 * Performs the actual parsing of the document. Each call will return
	 * the next chunk of html in the string. We only handle PCDATA content model.
	 *
	 * Access the html chunk and its offset in the string through the public
	 * $element and $elementStart members of this class.
	 *
	 * @return bool true if we identified a chunk of html in the remaining string
	 */
	public function parse() {
		// Current state
		$state = 'data';

		// Have we started marking an html tag to return?
		$haveElement = false;

		// Are we matching the entire body of a specific element?
		// Holds the element name while we are, false otherwise.
		$matchingEntireElement = false;

		// In case we need to handle flagCloseOnly
		$this->elementStart = $this->stream->getPos();

		$this->element = null;

		while ( $state !== null ) {

			switch ( $state ) {

				case 'data':
					$char = $this->stream->char();

					if ( $char === '<' ) {
						// "<": switch to the tag open state and remember where
						// the chunk starts (unless we are inside an element
						// whose whole body is being swallowed).
						$state = 'tag open';
						if ( !$matchingEntireElement ) {
							$this->elementStart = $this->stream->getPos() - 1;
							$haveElement = true;
						}

					} elseif ( $char === '>' && !$haveElement && $this->flagCloseOnly ) {
						// For MediaWiki, an unopened closing ">" could mean the
						// beginning of this fragment is in an element context.
						if ( !$matchingEntireElement ) {
							$this->captureElement();
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: return an unterminated tag only if asked to, or
						// if we were swallowing an entire element.
						if ( ( $haveElement && $this->flagOpenOnly ) || $matchingEntireElement ) {
							$this->captureElement();
							return true;
						}
						return false;

					} elseif ( self::isWhitespace( $char ) ) {
						// Skip the whole run of whitespace in one go.
						$this->stream->charsWhile( self::WHITESPACE );

					} else {
						// Optimization: skip all plain text up to the next
						// character that could matter. Stay in the data state.
						$this->stream->charsUntil( '<>' );
					}
					break;

				case 'tag open':
					$char = $this->stream->char();

					if ( $char === '!' ) {
						// "<!": comment, doctype or bogus comment.
						$state = 'markup declaration open';

					} elseif ( $char === '/' ) {
						// "</": end tag.
						$state = 'close tag open';

					} elseif ( self::isUpperAlpha( $char ) || self::isLowerAlpha( $char ) ) {
						// ASCII letter: create a start tag token whose name is
						// the lowercased letter. It is not emitted yet.
						$this->token = array(
							'name' => strtolower( $char ),
							'type' => self::STARTTAG,
							'attr' => array()
						);
						$state = 'tag name';

					} elseif ( $char === '>' ) {
						// "<>": parse error. For MediaWiki, we don't care
						// about returning "<>".
						$state = 'data';

					} elseif ( $char === '?' ) {
						// "<?": parse error, treated as a bogus comment.
						$this->token = array(
							'data' => '?',
							'type' => self::COMMENT
						);
						$state = 'bogus comment';

					} else {
						// Anything else (including EOF): parse error.
						// Reconsume the character in the data state.
						$state = 'data';
						$this->stream->unget();
					}
					break;

				case 'close tag open':
					$char = $this->stream->char();

					if ( self::isUpperAlpha( $char ) || self::isLowerAlpha( $char ) ) {
						// ASCII letter: create an end tag token whose name is
						// the lowercased letter. It is not emitted yet.
						$this->token = array(
							'name' => strtolower( $char ),
							'type' => self::ENDTAG
						);
						$state = 'tag name';

					} elseif ( $char === '>' ) {
						// Parse error, e.g. "</>". For MediaWiki, we want to
						// return this.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Parse error. Switch to the bogus comment state.
						$this->token = array(
							'data' => $char,
							'type' => self::COMMENT
						);
						$state = 'bogus comment';
					}
					break;

				case 'tag name':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						$state = 'before attribute name';

					} elseif ( $char === '/' ) {
						$state = 'self-closing start tag';

					} elseif ( $char === '>' ) {
						// End of the tag: emit it and go back to data.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( self::isUpperAlpha( $char ) ) {
						// Uppercase run: append it lowercased to the tag name.
						$chars = $this->stream->charsWhile( self::UPPER_ALPHA );
						$this->token['name'] .= strtolower( $char . $chars );

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else: append it, and everything up to the
						// next delimiter or uppercase letter, to the tag name.
						$chars = $this->stream->charsUntil( "\t\n\x0C />" . self::UPPER_ALPHA );
						$this->token['name'] .= $char . $chars;
					}

					// Once the name matches one of the "entire" elements, keep
					// scanning until its end tag rather than stopping at ">".
					if ( $this->token['name'] !== ''
						&& in_array( $this->token['name'], $this->entireElements, true )
					) {
						$matchingEntireElement = $this->token['name'];
					}
					break;

				case 'before attribute name':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						// Stay in the before attribute name state.
						$state = 'before attribute name';

					} elseif ( $char === '/' ) {
						$state = 'self-closing start tag';

					} elseif ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( self::isUpperAlpha( $char ) ) {
						// Start a new attribute named by the lowercased letter.
						$this->startAttribute( strtolower( $char ) );
						$state = 'attribute name';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else (", ', < and = are parse errors but
						// are handled the same way): start a new attribute.
						$this->startAttribute( $char );
						$state = 'attribute name';
					}
					break;

				case 'attribute name':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						$state = 'after attribute name';

					} elseif ( $char === '/' ) {
						$state = 'self-closing start tag';

					} elseif ( $char === '=' ) {
						$state = 'before attribute value';

					} elseif ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( self::isUpperAlpha( $char ) ) {
						// Uppercase run: append it lowercased to the name.
						$chars = $this->stream->charsWhile( self::UPPER_ALPHA );
						$this->appendToLastAttribute( 'name', strtolower( $char . $chars ) );

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else (", ' and < are parse errors but are
						// handled the same way): append to the name.
						$chars = $this->stream->charsUntil( "\t\n\x0C /=>\"'" . self::UPPER_ALPHA );
						$this->appendToLastAttribute( 'name', $char . $chars );
					}
					break;

				case 'after attribute name':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						// Stay in the after attribute name state.
						$state = 'after attribute name';

					} elseif ( $char === '/' ) {
						$state = 'self-closing start tag';

					} elseif ( $char === '=' ) {
						$state = 'before attribute value';

					} elseif ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( self::isUpperAlpha( $char ) ) {
						// Start a new attribute named by the lowercased letter.
						$this->startAttribute( strtolower( $char ) );
						$state = 'attribute name';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else (", ' and < are parse errors but are
						// handled the same way): start a new attribute.
						$this->startAttribute( $char );
						$state = 'attribute name';
					}
					break;

				case 'before attribute value':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						// Stay in the before attribute value state.
						$state = 'before attribute value';

					} elseif ( $char === '"' ) {
						$state = 'attribute value (double-quoted)';

					} elseif ( $char === '&' ) {
						// Reconsume the "&" as part of an unquoted value.
						$this->stream->unget();
						$state = 'attribute value (unquoted)';

					} elseif ( $char === '\'' ) {
						$state = 'attribute value (single-quoted)';

					} elseif ( $char === '>' ) {
						// Parse error. Emit the current tag token.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else (= and < are parse errors but are
						// handled the same way): first character of an
						// unquoted value.
						$this->appendToLastAttribute( 'value', $char );
						$state = 'attribute value (unquoted)';
					}
					break;

				case 'attribute value (double-quoted)':
					$char = $this->stream->char();

					if ( $char === '"' ) {
						$state = 'after attribute value (quoted)';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Append everything up to the closing quote.
						$chars = $this->stream->charsUntil( '"' );
						$this->appendToLastAttribute( 'value', $char . $chars );
					}
					break;

				case 'attribute value (single-quoted)':
					$char = $this->stream->char();

					if ( $char === "'" ) {
						$state = 'after attribute value (quoted)';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Append everything up to the closing apostrophe.
						$chars = $this->stream->charsUntil( "'" );
						$this->appendToLastAttribute( 'value', $char . $chars );
					}
					break;

				case 'attribute value (unquoted)':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						$state = 'before attribute name';

					} elseif ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Anything else (", ', < and = are parse errors but
						// are handled the same way): append to the value.
						$chars = $this->stream->charsUntil( "\t\n\x0c &>\"'=" );
						$this->appendToLastAttribute( 'value', $char . $chars );
					}
					break;

				case 'after attribute value (quoted)':
					$char = $this->stream->char();

					if ( self::isWhitespace( $char ) ) {
						$state = 'before attribute name';

					} elseif ( $char === '/' ) {
						$state = 'self-closing start tag';

					} elseif ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Parse error. Reconsume the character in the before
						// attribute name state.
						$this->stream->unget();
						$state = 'before attribute name';
					}
					break;

				case 'self-closing start tag':
					$char = $this->stream->char();

					if ( $char === '>' ) {
						// Set the self-closing flag and emit the tag.
						$this->token['self-closing'] = true;
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Parse error. Reconsume the character in the before
						// attribute name state.
						$this->stream->unget();
						$state = 'before attribute name';
					}
					break;

				case 'bogus comment':
					// Consume everything up to the first ">" or EOF, whichever
					// comes first; that text is the comment's data.
					$this->token['data'] .= (string)$this->stream->charsUntil( '>' );
					// Consume the ">" itself (or hit EOF).
					$this->stream->char();

					$state = 'data';
					if ( $this->emitElement( $matchingEntireElement ) ) {
						return true;
					}
					break;

				case 'markup declaration open':
					$hyphens = $this->stream->charsWhile( '-', 2 );

					if ( $hyphens === '--' ) {
						// "<!--": start a comment with empty data.
						$this->token = array(
							'data' => '',
							'type' => self::COMMENT
						);
						$state = 'comment start';
						break;
					}

					if ( $hyphens === '-' ) {
						// A lone hyphen is not part of a comment opener.
						$this->stream->unget();
					}
					$alpha = $this->stream->charsWhile( self::ALPHA, 7 );

					if ( strtoupper( $alpha ) === 'DOCTYPE' ) {
						// For MediaWiki, we're simplifying and saying DOCTYPE
						// is just another self-closing tag, so the rest of the
						// declaration is scanned like attributes.
						$this->token = array(
							'name' => '!DOCTYPE',
							'data' => '',
							'type' => self::STARTTAG,
							'attr' => array(),
							'self-closing' => true,
						);
						$state = 'attribute name';

					} else {
						// XXX CDATA sections ("<![CDATA[") are not implemented.
						// Otherwise, it is a parse error: switch to the bogus
						// comment state, keeping the letters already consumed.
						$this->token = array(
							'data' => (string)$alpha,
							'type' => self::COMMENT
						);
						$state = 'bogus comment';
					}
					break;

				case 'comment start':
					$char = $this->stream->char();

					if ( $char === '-' ) {
						$state = 'comment start dash';

					} elseif ( $char === '>' ) {
						// Parse error, e.g. "<!-->". For MediaWiki we should
						// return this.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Append the character to the comment's data.
						$this->token['data'] .= $char;
						$state = 'comment';
					}
					break;

				case 'comment start dash':
					$char = $this->stream->char();

					if ( $char === '-' ) {
						$state = 'comment end';

					} elseif ( $char === '>' ) {
						// Parse error, e.g. "<!--->". For MediaWiki, we
						// return this.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						$this->token['data'] .= '-' . $char;
						$state = 'comment';
					}
					break;

				case 'comment':
					$char = $this->stream->char();

					if ( $char === '-' ) {
						$state = 'comment end dash';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Append everything up to the next hyphen. Stay in
						// the comment state.
						$chars = $this->stream->charsUntil( '-' );
						$this->token['data'] .= $char . $chars;
					}
					break;

				case 'comment end dash':
					$char = $this->stream->char();

					if ( $char === '-' ) {
						$state = 'comment end';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// The hyphen was ordinary comment text after all.
						$this->token['data'] .= '-' . $char;
						$state = 'comment';
					}
					break;

				case 'comment end':
					$char = $this->stream->char();

					if ( $char === '>' ) {
						// "-->": emit the comment.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === '-' ) {
						// Parse error. Append a hyphen and stay in this state.
						$this->token['data'] .= '-';

					} elseif ( self::isWhitespace( $char ) ) {
						// "--" followed by TAB, LF, FF or SPACE.
						$this->token['data'] .= '--' . $char;
						$state = 'comment end space';

					} elseif ( $char === '!' ) {
						$state = 'comment end bang';

					} elseif ( $char === false ) {
						// EOF: parse error, reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						// Parse error. The two hyphens were comment text.
						$this->token['data'] .= '--' . $char;
						$state = 'comment';
					}
					break;

				case 'comment end bang':
					$char = $this->stream->char();

					if ( $char === '>' ) {
						// "--!>": emit the comment.
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === '-' ) {
						$this->token['data'] .= '--!';
						$state = 'comment end dash';

					} elseif ( $char === false ) {
						// EOF: reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						$this->token['data'] .= '--!' . $char;
						$state = 'comment';
					}
					break;

				case 'comment end space':
					$char = $this->stream->char();

					if ( $char === '>' ) {
						$state = 'data';
						if ( $this->emitElement( $matchingEntireElement ) ) {
							return true;
						}

					} elseif ( $char === '-' ) {
						$state = 'comment end dash';

					} elseif ( self::isWhitespace( $char ) ) {
						// More whitespace: append it and stay in this state.
						$this->token['data'] .= $char;

					} elseif ( $char === false ) {
						// EOF: reconsume it in the data state.
						$this->stream->unget();
						$state = 'data';

					} else {
						$this->token['data'] .= $char;
						$state = 'comment';
					}
					break;

				// case 'cdataSection':

			}
		}

		// Not reached in practice: the data state returns at EOF.
		return false;
	}

}



/*

Copyright 2009 Geoffrey Sneddon

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +*/ + +class HTML5_InputStream { + /** + * The string data we're parsing, as stored by the constructor after normalisation. + */ + private $data; + + /** + * The current integer byte position we are at in $data + */ + private $char; + + /** + * Length of $data in bytes; when $char === $EOF, we are at the end-of-file. + */ + private $EOF; + + /** + * Parse errors. No method of this class writes to it. + */ + public $errors = array(); + + /** + * @param string $data Data to parse; BOM, NUL and CR handling is applied before it is stored + */ + public function __construct( $data ) { + + /* One leading U+FEFF BYTE ORDER MARK character must be + ignored if any are present. */ + if ( substr( $data, 0, 3 ) === "\xEF\xBB\xBF" ) { + $data = substr( $data, 3 ); + } + + /* All U+0000 NULL characters in the input must be replaced + by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such + characters is a parse error. + + U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED + (LF) characters are treated specially. Any CR characters + that are followed by LF characters must be removed, and any + CR characters not followed by LF characters must be converted + to LF characters. Thus, newlines in HTML DOMs are represented + by LF characters, and there are never any CR characters in the + input to the tokenization stage. */ + $data = str_replace( + array( + "\0", + "\r\n", + "\r" + ), + array( + "\xEF\xBF\xBD", + "\n", + "\n" + ), + $data + ); + + $this->data = $data; + $this->char = 0; + $this->EOF = strlen( $data ); + } + + + /** + * Consume and return the next byte of the data, or false at end-of-file.
+ * @note This performs bounds checking; the position is advanced even when false is returned + */ + public function char() { + return ( $this->char++ < $this->EOF ) + ? $this->data[$this->char - 1] + : false; + } + + /** + * Consumes bytes up to (not including) the first byte found in $bytes, at most $max bytes + * when $max is given, and returns them; returns false if already at end-of-file. + * @param string $bytes Bytes to stop at. + */ + public function charsUntil( $bytes, $max = null ) { + if ( $this->char < $this->EOF ) { + if ( $max === 0 || $max ) { + $len = strcspn( $this->data, $bytes, $this->char, $max ); + } else { + $len = strcspn( $this->data, $bytes, $this->char ); + } + $string = (string) substr( $this->data, $this->char, $len ); + $this->char += $len; + return $string; + } else { + return false; + } + } + + /** + * Consumes the longest run of bytes that all appear in $bytes, at most $max bytes + * when $max is given, and returns it; returns false if already at end-of-file. + * @param string $bytes Bytes to match. + */ + public function charsWhile( $bytes, $max = null ) { + if ( $this->char < $this->EOF ) { + if ( $max === 0 || $max ) { + $len = strspn( $this->data, $bytes, $this->char, $max ); + } else { + $len = strspn( $this->data, $bytes, $this->char ); + } + $string = (string) substr( $this->data, $this->char, $len ); + $this->char += $len; + return $string; + } else { + return false; + } + } + + /** + * Unconsume one character. Does nothing when the position is already past $EOF. + */ + public function unget() { + if ( $this->char <= $this->EOF ) { + $this->char--; + } + } + + /** + * Get the current pointer (byte offset into the data) + * @return int + */ + public function getPos() { + return $this->char; + } + + /** + * Set the current pointer + * @param int $ndx pointer into the data (stored as given, not range-checked) + */ + public function setPos( $ndx ) { + $this->char = $ndx; + } + + /** + * Get a substring of the data. + * @param int $start offset of the first byte + * @param int $end offset one past the last byte; an Exception is thrown if it is less than $start + * @return string + */ + public function getSubstr( $start, $end ) { + if ( $end < $start ) { + throw new Exception( 'End was before start?'
); + } + $length = $end - $start; + return substr( $this->data, $start, $length ); + } + +} diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php index eae77fb..0c7809f 100644 --- a/languages/LanguageConverter.php +++ b/languages/LanguageConverter.php @@ -353,55 +353,66 @@ class LanguageConverter { return $text; } - /* we convert everything except: + /* Do the conversion. We convert everything except: 1. HTML markups (anything between < and >) 2. HTML entities 3. placeholders created by the parser */ + + // Get regex for parser placeholders global $wgParser; + $marker = false; if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) { - $marker = '|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+'; - } else { - $marker = ''; + $marker = '/' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+/s'; } - // this one is needed when the text is inside an HTML markup - $htmlfix = '|<[^>]+$|^[^<>]*>'; - - // disable convert to variants between tags - $codefix = '.+?<\/code>|'; - // disable conversion of