From b2bf756c1483c0dc65110bdd1b63710e9cdd6edc Mon Sep 17 00:00:00 2001
From: csteipp <csteipp@wikimedia.org>
Date: Wed, 5 Nov 2014 15:42:20 -0800
Subject: [PATCH] SECURITY: Properly remove html from conversion text

When converting a text to a variant, html should not be converted. This
patch parses the text as html5, and protects any html from translation.

Change-Id: I268fdb9be3c9f7f020aab3a0200db6b7a0beddaa
---
 includes/AutoLoader.php                       |    1 +
 includes/Html5Tokenizer.php                   | 1364 +++++++++++++++++++++++++
 languages/LanguageConverter.php               |   89 +-
 tests/phpunit/includes/Html5TokenizerTest.php |  132 +++
 4 files changed, 1559 insertions(+), 27 deletions(-)
 create mode 100644 includes/Html5Tokenizer.php
 create mode 100644 tests/phpunit/includes/Html5TokenizerTest.php

diff --git a/includes/AutoLoader.php b/includes/AutoLoader.php
index 172bd49..460730a 100644
--- a/includes/AutoLoader.php
+++ b/includes/AutoLoader.php
@@ -82,6 +82,7 @@ $wgAutoloadLocalClasses = array(
 	'HistoryBlobStub' => 'includes/HistoryBlob.php',
 	'Hooks' => 'includes/Hooks.php',
 	'Html' => 'includes/Html.php',
+	'Html5Tokenizer' => 'includes/Html5Tokenizer.php',
 	'HtmlFormatter' => 'includes/HtmlFormatter.php',
 	'HTMLApiField' => 'includes/htmlform/HTMLApiField.php',
 	'HTMLAutoCompleteSelectField' => 'includes/htmlform/HTMLAutoCompleteSelectField.php',
diff --git a/includes/Html5Tokenizer.php b/includes/Html5Tokenizer.php
new file mode 100644
index 0000000..e551c44
--- /dev/null
+++ b/includes/Html5Tokenizer.php
@@ -0,0 +1,1364 @@
+<?php
+/**
+ * HTML Tokenizer for %MediaWiki. Parses a string according to the html 5
+ * tokenizing spec [http://dev.w3.org/html5/spec-preview/tokenization.html],
+ * except for when we could optimize for this specific MediaWiki task.
+ *
+ * This is based heavily on the html5lib-php project
+ * (https://github.com/html5lib/html5lib-php), licensed as:
+ *
+ * Copyright 2007 Jeroen van der Meer <http://jero.net/>
+ * Copyright 2008 Edward Z. Yang <http://htmlpurifier.org/>
+ * Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * @file
+ */
+
+// In general:
+// /* */ indicates verbatim text from the HTML 5 specification
+// // indicates regular comments
+
+class Html5Tokenizer {
+
+	/**
+	 * HTML5_InputStream the stream we parse to find each chunk of html
+	 */
+	private $stream;
+
+	/**
+	 * index into the data where the next html chunk starts
+	 */
+	public $elementStart;
+
+	/**
+	 * The next html chunk
+	 */
+	public $element;
+
+	/**
+	 * Array of elements where we return the entire
+	 * content. Usually array( 'pre', 'code', 'style', 'script' );
+	 */
+	private $entireElements;
+
+	/**
+	 * Do we return /^[^<]*>/ as an html token?
+	 */
+	private $flagCloseOnly;
+
+	/**
+	 * Do we return /<[^>]*$/ as an html token?
+	 */
+	private $flagOpenOnly;
+
+	/**
+	 * Current token that is being built, but not yet emitted. Also
+	 * is the last token emitted, if applicable.
+	 */
+	protected $token;
+
+	// These are constants describing tokens
+	const DOCTYPE        = 0;
+	const STARTTAG       = 1;
+	const ENDTAG         = 2;
+	const COMMENT        = 3;
+	const CHARACTER      = 4;
+	const SPACECHARACTER = 5;
+	const EOF            = 6;
+	const PARSEERROR     = 7;
+
+	// These are constants representing bunches of characters.
+	const ALPHA        = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
+	const UPPER_ALPHA  = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
+	const LOWER_ALPHA  = 'abcdefghijklmnopqrstuvwxyz';
+	const DIGIT        = '0123456789';
+	const HEX          = '0123456789ABCDEFabcdef';
+	const WHITESPACE   = "\t\n\x0c ";
+
+	/**
+	 * @param string $data the html string to tokenize
+	 * @param array $entireElements list of element names to get the entire contents of
+	 */
+	public function __construct( $data, array $entireElements = array() ) {
+		$this->flagOpenOnly = false; // partial-tag matching stays off until a setter enables it
+		$this->flagCloseOnly = false;
+		$this->entireElements = $entireElements; // tag names whose whole content is returned as one chunk
+		$this->stream = new HTML5_InputStream( $data ); // the input that parse() reads from
+	}
+
+	/**
+	 * @param bool $flag whether to match ^[^<]*>
+	 */
+	public function setFlagCloseOnly( $flag ) {
+		$this->flagCloseOnly = $flag; // read by parse() when it meets a '>' before any '<'
+	}
+
+	/**
+	 * @param bool $flag whether to match <[^>]*$
+	 */
+	public function setFlagOpenOnly( $flag ) {
+		$this->flagOpenOnly = $flag; // read by parse() at end of input once a '<' has been seen
+	}
+
+	/**
+	 * @return array|null the token built most recently, or null if none has been built yet
+	 */
+	public function getLastToken() {
+		return $this->token; // assigned by parse() each time it starts building a token
+	}
+
+
+	public function checkEntireElementMatching( $matchingEntireElement ) {
+		if ( !$matchingEntireElement ) {
+			return true; // not collecting a whole element, so the chunk ends with this token
+		}
+		$closes = $this->token['type'] === self::ENDTAG || !empty( $this->token['self-closing'] );
+		return $closes && $this->token['name'] === $matchingEntireElement;
+	}
+
+	/**
+	 * Performs the actual parsing of the document. Each call will return
+	 * the next chunk of html in the string. We only handle PCDATA content model.
+	 *
+	 * Access the html chunk and its offset in the string through the public $element
+	 * and $elementStart members of this class.
+	 *
+	 * @return bool true if we identified a chunk of html in the remaining string
+	 */
+	public function parse() {
+		// Current state
+		$state = 'data';
+
+		// This is used to avoid having to have look-behind in the data state.
+		$lastFourChars = '';
+
+		/**
+		 * Escape flag as specified by the HTML5 specification: "used to
+		 * control the behavior of the tokeniser. It is either true or
+		 * false, and initially must be set to the false state."
+		 */
+		$escape = false;
+
+		// Have we started marking an html tag to return?
+		$haveElement = false;
+
+		// Are we matching the entire body of a specific element?
+		$matchingEntireElement = false;
+
+		// In case we need to handle flagCloseOnly
+		$this->elementStart = $this->stream->getPos();
+
+		$this->element = null;
+
+		while( $state !== null ) {
+
+			switch($state) {
+
+				case 'data':
+					/* Consume the next input character */
+					$char = $this->stream->char();
+					$lastFourChars .= $char;
+					if ( strlen( $lastFourChars ) > 4 ) {
+						$lastFourChars = substr( $lastFourChars, -4 );
+					}
+
+					/* U+003C LESS-THAN SIGN (<) */
+					if ( $char === '<' ) {
+						/* When the content model flag is set to the PCDATA state: switch
+						to the tag open state. */
+						$state = 'tag open';
+						if ( !$matchingEntireElement ) {
+							$this->elementStart = $this->stream->getPos() - 1;
+							$haveElement = true;
+						}
+
+					} elseif ( $char === '>'
+						&& !$haveElement
+						&& $this->flagCloseOnly
+					) {
+						// For MediaWiki, an unopened closing tag could mean
+						// the beginning of this fragment is in an element context
+						if ( !$matchingEntireElement ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* EOF : Emit an end-of-file token. */
+						$state = null;
+						if ( $haveElement && $this->flagOpenOnly
+							|| $matchingEntireElement
+						) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+						return false;
+
+					} elseif ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						// Directly after emitting a token you switch back to the "data
+						// state". At that point spaceCharacters are important so they are
+						// emitted separately.
+						$chars = $this->stream->charsWhile( self::WHITESPACE );
+						$lastFourChars .= $chars;
+						if ( strlen( $lastFourChars ) > 4 ) {
+							$lastFourChars = substr( $lastFourChars, -4 );
+						}
+
+					} else {
+						/* Anything else
+						THIS IS AN OPTIMIZATION: Get as many character that
+						otherwise would also be treated as a character token and emit it
+						as a single character token. Stay in the data state. */
+						$chars = $this->stream->charsUntil( '<>' );
+						$lastFourChars .= $chars;
+						if ( strlen( $lastFourChars ) > 4 ) {
+							$lastFourChars = substr( $lastFourChars, -4 );
+						}
+						$state = 'data';
+					}
+				break;
+
+				case 'tag open':
+					$char = $this->stream->char();
+
+					if ( $char === '!' ) {
+						/* U+0021 EXCLAMATION MARK (!)
+						Switch to the markup declaration open state. */
+						$state = 'markup declaration open';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the close tag open state. */
+						$state = 'close tag open';
+
+					} elseif( 'A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
+						Create a new start tag token, set its tag name to the lowercase
+						version of the input character (add 0x0020 to the character's code
+						point), then switch to the tag name state. (Don't emit the token
+						yet; further details will be filled in before it is emitted.) */
+						$this->token = array(
+							'name'  => strtolower( $char ),
+							'type'  => self::STARTTAG,
+							'attr'  => array()
+						);
+
+						$state = 'tag name';
+
+					} elseif( 'a' <= $char && $char <= 'z' ) {
+						/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+						Create a new start tag token, set its tag name to the input
+						character, then switch to the tag name state. (Don't emit
+						the token yet; further details will be filled in before it
+						is emitted.) */
+						$this->token = array(
+							'name'  => $char,
+							'type'  => self::STARTTAG,
+							'attr'  => array()
+						);
+
+						$state = 'tag name';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Parse error. Emit a U+003C LESS-THAN SIGN character token and a
+						U+003E GREATER-THAN SIGN character token. Switch to the data state. */
+						// For MediaWiki, we don't care about returning "<>"
+						$state = 'data';
+
+					} elseif ( $char === '?' ) {
+						/* U+003F QUESTION MARK (?)
+						Parse error. Switch to the bogus comment state. */
+						$this->token = array(
+							'data' => '?',
+							'type' => self::COMMENT
+						);
+						$state = 'bogus comment';
+
+					} else {
+						/* Anything else
+						Parse error. Emit a U+003C LESS-THAN SIGN character token and
+						reconsume the current input character in the data state. */
+						$state = 'data';
+						$this->stream->unget();
+					}
+				break;
+
+				case 'close tag open':
+					$char = $this->stream->char();
+
+					if ( 'A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
+						Create a new end tag token, set its tag name to the lowercase version
+						of the input character (add 0x0020 to the character's code point), then
+						switch to the tag name state. (Don't emit the token yet; further details
+						will be filled in before it is emitted.) */
+						$this->token = array(
+							'name'  => strtolower( $char ),
+							'type'  => self::ENDTAG
+						);
+						$state = 'tag name';
+
+					} elseif ( 'a' <= $char && $char <= 'z' ) {
+						/* U+0061 LATIN SMALL LETTER A through to U+007A LATIN SMALL LETTER Z
+						Create a new end tag token, set its tag name to the
+						input character, then switch to the tag name state.
+						(Don't emit the token yet; further details will be
+						filled in before it is emitted.) */
+						$this->token = array(
+							'name'  => $char,
+							'type'  => self::ENDTAG
+						);
+						$state = 'tag name';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Parse error. Switch to the data state. */
+						// e.g., </>. For MediaWiki, we want to return this
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Parse error. Switch to the bogus comment state. */
+						$this->token = array(
+							'data' => $char,
+							'type' => self::COMMENT
+						);
+						$state = 'bogus comment';
+					}
+				break;
+
+				case 'tag name':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Switch to the before attribute name state. */
+						$state = 'before attribute name';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the self-closing start tag state. */
+						$state = 'self-closing start tag';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( 'A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+						Append the lowercase version of the current input
+						character (add 0x0020 to the character's code point) to
+						the current tag token's tag name. Stay in the tag name state. */
+						$chars = $this->stream->charsWhile( self::UPPER_ALPHA );
+						$this->token['name'] .= strtolower($char . $chars);
+						$state = 'tag name';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Append the current input character to the current tag token's tag name.
+						Stay in the tag name state. */
+						$chars = $this->stream->charsUntil( "\t\n\x0C />" . self::UPPER_ALPHA );
+						$this->token['name'] .= $char . $chars;
+						$state = 'tag name';
+					}
+
+					if ( $this->token['name'] && in_array( $this->token['name'], $this->entireElements ) ) {
+						$matchingEntireElement = $this->token['name'];
+					}
+				break;
+
+				case 'before attribute name':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Stay in the before attribute name state. */
+						$state = 'before attribute name';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the self-closing start tag state. */
+						$state = 'self-closing start tag';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ('A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+						Start a new attribute in the current tag token. Set that
+						attribute's name to the lowercase version of the current
+						input character (add 0x0020 to the character's code
+						point), and its value to the empty string. Switch to the
+						attribute name state.*/
+						$this->token['attr'][] = array(
+							'name'  => strtolower( $char ),
+							'value' => ''
+						);
+
+						$state = 'attribute name';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* U+0022 QUOTATION MARK (")
+						   U+0027 APOSTROPHE (')
+						   U+003C LESS-THAN SIGN (<)
+						   U+003D EQUALS SIGN (=)
+						Parse error. Treat it as per the "anything else" entry
+						below.
+
+						Anything else
+						Start a new attribute in the current tag token. Set that attribute's
+						name to the current input character, and its value to the empty string.
+						Switch to the attribute name state. */
+						$this->token['attr'][] = array(
+							'name'  => $char,
+							'value' => ''
+						);
+
+						$state = 'attribute name';
+					}
+				break;
+
+				case 'attribute name':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					// this conditional is optimized, check bottom
+					if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Switch to the after attribute name state. */
+						$state = 'after attribute name';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the self-closing start tag state. */
+						$state = 'self-closing start tag';
+
+					} elseif ( $char === '=' ) {
+						/* U+003D EQUALS SIGN (=)
+						Switch to the before attribute value state. */
+						$state = 'before attribute value';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( 'A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+						Append the lowercase version of the current input
+						character (add 0x0020 to the character's code point) to
+						the current attribute's name. Stay in the attribute name
+						state. */
+						$chars = $this->stream->charsWhile( self::UPPER_ALPHA );
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['name'] .= strtolower( $char . $chars );
+
+						$state = 'attribute name';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* U+0022 QUOTATION MARK (")
+						   U+0027 APOSTROPHE (')
+						   U+003C LESS-THAN SIGN (<)
+						Parse error. Treat it as per the "anything else"
+						entry below.
+
+						Anything else
+						Append the current input character to the current attribute's name.
+						Stay in the attribute name state. */
+						$chars = $this->stream->charsUntil( "\t\n\x0C /=>\"'" . self::UPPER_ALPHA );
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['name'] .= $char . $chars;
+
+						$state = 'attribute name';
+					}
+
+				break;
+
+				case 'after attribute name':
+					// Consume the next input character:
+					$char = $this->stream->char();
+
+					// this is an optimized conditional, check the bottom
+					if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Stay in the after attribute name state. */
+						$state = 'after attribute name';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the self-closing start tag state. */
+						$state = 'self-closing start tag';
+
+					} elseif ( $char === '=' ) {
+						/* U+003D EQUALS SIGN (=)
+						Switch to the before attribute value state. */
+						$state = 'before attribute value';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( 'A' <= $char && $char <= 'Z' ) {
+						/* U+0041 LATIN CAPITAL LETTER A through to U+005A LATIN CAPITAL LETTER Z
+						Start a new attribute in the current tag token. Set that
+						attribute's name to the lowercase version of the current
+						input character (add 0x0020 to the character's code
+						point), and its value to the empty string. Switch to the
+						attribute name state. */
+						$this->token['attr'][] = array(
+							'name'  => strtolower( $char ),
+							'value' => ''
+						);
+						$state = 'attribute name';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* U+0022 QUOTATION MARK (")
+						   U+0027 APOSTROPHE (')
+						   U+003C LESS-THAN SIGN(<)
+						Parse error. Treat it as per the "anything else"
+						entry below.
+
+						Anything else
+						Start a new attribute in the current tag token. Set that attribute's
+						name to the current input character, and its value to the empty string.
+						Switch to the attribute name state. */
+						$this->token['attr'][] = array(
+							'name'  => $char,
+							'value' => ''
+						);
+
+						$state = 'attribute name';
+					}
+				break;
+
+				case 'before attribute value':
+					// Consume the next input character:
+					$char = $this->stream->char();
+
+					// this is an optimized conditional
+					if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Stay in the before attribute value state. */
+						$state = 'before attribute value';
+
+					} elseif ( $char === '"' ) {
+						/* U+0022 QUOTATION MARK (")
+						Switch to the attribute value (double-quoted) state. */
+						$state = 'attribute value (double-quoted)';
+
+					} elseif ( $char === '&' ) {
+						/* U+0026 AMPERSAND (&)
+						Switch to the attribute value (unquoted) state and reconsume
+						this input character. */
+						$this->stream->unget();
+						$state = 'attribute value (unquoted)';
+
+					} elseif ($char === '\'' ) {
+						/* U+0027 APOSTROPHE (')
+						Switch to the attribute value (single-quoted) state. */
+						$state = 'attribute value (single-quoted)';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Parse error. Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* U+003D EQUALS SIGN (=)
+						 * U+003C LESS-THAN SIGN (<)
+						Parse error. Treat it as per the "anything else" entry below.
+
+						Anything else
+						Append the current input character to the current attribute's value.
+						Switch to the attribute value (unquoted) state. */
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['value'] .= $char;
+						$state = 'attribute value (unquoted)';
+					}
+				break;
+
+				case 'attribute value (double-quoted)':
+					// Consume the next input character:
+					$char = $this->stream->char();
+
+					if( $char === '"' ) {
+						/* U+0022 QUOTATION MARK (")
+						Switch to the after attribute value (quoted) state. */
+						$state = 'after attribute value (quoted)';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Append the current input character to the current attribute's value.
+						Stay in the attribute value (double-quoted) state. */
+						$chars = $this->stream->charsUntil( '"' );
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['value'] .= $char . $chars;
+						$state = 'attribute value (double-quoted)';
+					}
+				break;
+
+				case 'attribute value (single-quoted)':
+					// Consume the next input character:
+					$char = $this->stream->char();
+
+					if( $char === "'" ) {
+						/* U+0027 APOSTROPHE (')
+						Switch to the after attribute value state. */
+						$state = 'after attribute value (quoted)';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Append the current input character to the current attribute's value.
+						Stay in the attribute value (single-quoted) state. */
+						$chars = $this->stream->charsUntil( "'" );
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['value'] .= $char . $chars;
+						$state = 'attribute value (single-quoted)';
+					}
+				break;
+
+				case 'attribute value (unquoted)':
+					// Consume the next input character:
+					$char = $this->stream->char();
+
+					if( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						U+000A LINE FEED (LF)
+						U+000C FORM FEED (FF)
+						U+0020 SPACE
+						Switch to the before attribute name state. */
+						$state = 'before attribute name';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ($char === false) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+					} else {
+						/* U+0022 QUOTATION MARK (")
+						   U+0027 APOSTROPHE (')
+						   U+003C LESS-THAN SIGN (<)
+						   U+003D EQUALS SIGN (=)
+						Parse error. Treat it as per the "anything else"
+						entry below.
+
+						Anything else
+						Append the current input character to the current attribute's value.
+						Stay in the attribute value (unquoted) state. */
+						$chars = $this->stream->charsUntil( "\t\n\x0c &>\"'=" );
+
+						$last = count( $this->token['attr'] ) - 1;
+						$this->token['attr'][$last]['value'] .= $char . $chars;
+
+						$state = 'attribute value (unquoted)';
+					}
+				break;
+
+				case 'after attribute value (quoted)':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* U+0009 CHARACTER TABULATION
+						   U+000A LINE FEED (LF)
+						   U+000C FORM FEED (FF)
+						   U+0020 SPACE
+						Switch to the before attribute name state. */
+						$state = 'before attribute name';
+
+					} elseif ( $char === '/' ) {
+						/* U+002F SOLIDUS (/)
+						Switch to the self-closing start tag state. */
+						$state = 'self-closing start tag';
+
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the current tag token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Parse error. Reconsume the character in the before attribute
+						name state. */
+						$this->stream->unget();
+						$state = 'before attribute name';
+					}
+				break;
+
+				case 'self-closing start tag':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Set the self-closing flag of the current tag token.
+						Emit the current tag token. Switch to the data state. */
+						// not sure if this is the name we want
+						$this->token['self-closing'] = true;
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Reconsume the EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Parse error. Reconsume the character in the before attribute name state. */
+						$this->stream->unget();
+						$state = 'before attribute name';
+					}
+				break;
+
+				case 'bogus comment':
+					/* Consume every character up to the first U+003E GREATER-THAN SIGN
+					character (>) or the end of the file (EOF), whichever comes first. Emit
+					a comment token whose data is the concatenation of all the characters
+					starting from and including the character that caused the state machine
+					to switch into the bogus comment state, up to and including the last
+					consumed character before the U+003E character, if any, or up to the
+					end of the file otherwise. (If the comment was started by the end of
+					the file (EOF), the token is empty.) */
+					$this->token['data'] .= (string) $this->stream->charsUntil( '>' );
+					$this->stream->char();
+
+					/* Switch to the data state. */
+					$state = 'data';
+					if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+						$this->element = $this->stream->getSubstr(
+							$this->elementStart,
+							$this->stream->getPos()
+						);
+						return true;
+					}
+				break;
+
+				case 'markup declaration open':
+					// Consume for below
+					$hyphens = $this->stream->charsWhile( '-', 2 );
+					if ( $hyphens === '-' ) {
+						$this->stream->unget();
+					}
+					if ( $hyphens !== '--' ) {
+						$alpha = $this->stream->charsWhile( self::ALPHA, 7 );
+					}
+
+					/* If the next two characters are both U+002D HYPHEN-MINUS (-)
+					characters, consume those two characters, create a comment token whose
+					data is the empty string, and switch to the comment state. */
+					if ( $hyphens === '--' ) {
+						$state = 'comment start';
+						$this->token = array(
+							'data' => '',
+							'type' => self::COMMENT
+						);
+
+					/* Otherwise if the next seven characters are a case-insensitive match
+					for the word "DOCTYPE", then consume those characters and switch to the
+					DOCTYPE state. */
+					} elseif ( strtoupper( $alpha ) === 'DOCTYPE' ) {
+						# $state = 'DOCTYPE';
+						// For MediaWiki, we're simplifying and saying DOCTYPE
+						// is just another self-closing tag
+						$state = 'attribute name';
+						$this->token = array(
+							'name' => '!DOCTYPE',
+							'data' => '',
+							'type' => self::STARTTAG,
+							'self-closing' => true,
+						);
+
+						// XXX not implemented
+						/* Otherwise, if the insertion mode is "in foreign content"
+						and the current node is not an element in the HTML namespace
+						and the next seven characters are an ASCII case-sensitive
+						match for the string "[CDATA[" (the five uppercase letters
+						"CDATA" with a U+005B LEFT SQUARE BRACKET character before
+						and after), then consume those characters and switch to the
+						CDATA section state (which is unrelated to the content model
+						flag's CDATA state).
+
+						Otherwise, it is a parse error. Switch to the bogus comment state.
+						The next character that is consumed, if any, is the first character
+						that will be in the comment. */
+					} else {
+						$this->token = array(
+							'data' => (string) $alpha,
+							'type' => self::COMMENT
+						);
+						$state = 'bogus comment';
+					}
+				break;
+
+				case 'comment start':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if ( $char === '-' ) {
+						/* U+002D HYPHEN-MINUS (-)
+						Switch to the comment start dash state. */
+						$state = 'comment start dash';
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Parse error. Emit the comment token. Switch to the
+						data state. */
+						// E.g., <!-->. For MediaWiki we should return this
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Emit the comment token. Reconsume the
+						EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+					} else {
+						/* Anything else
+						Append the input character to the comment token's
+						data. Switch to the comment state. */
+						$this->token['data'] .= $char;
+						$state = 'comment';
+					}
+				break;
+
+				case 'comment start dash':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+					if ( $char === '-' ) {
+						/* U+002D HYPHEN-MINUS (-)
+						Switch to the comment end state */
+						$state = 'comment end';
+					} elseif ( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Parse error. Emit the comment token. Switch to the
+						data state. */
+						// E.g., <!--->. For MediaWiki, we return this
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === false ) {
+						/* Parse error. Emit the comment token. Reconsume the
+						EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						$this->token['data'] .= '-' . $char;
+						$state = 'comment';
+					}
+				break;
+
+				case 'comment':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if( $char === '-' ) {
+						/* U+002D HYPHEN-MINUS (-)
+						Switch to the comment end dash state */
+						$state = 'comment end dash';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Emit the comment token. Reconsume the EOF character
+						in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Append the input character to the comment token's data. Stay in
+						the comment state. */
+						$chars = $this->stream->charsUntil( '-' );
+						$this->token['data'] .= $char . $chars;
+					}
+				break;
+
+				case 'comment end dash':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if( $char === '-' ) {
+						/* U+002D HYPHEN-MINUS (-)
+						Switch to the comment end state  */
+						$state = 'comment end';
+
+					} elseif( $char === false ) {
+						/* EOF
+						Parse error. Emit the comment token. Reconsume the EOF character
+						in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Append a U+002D HYPHEN-MINUS (-) character and the input
+						character to the comment token's data. Switch to the comment state. */
+						$this->token['data'] .= '-' . $char;
+						$state = 'comment';
+					}
+				break;
+
+				case 'comment end':
+					/* Consume the next input character: */
+					$char = $this->stream->char();
+
+					if( $char === '>' ) {
+						/* U+003E GREATER-THAN SIGN (>)
+						Emit the comment token. Switch to the data state. */
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+
+					} elseif ( $char === '-' ) {
+						/* U+002D HYPHEN-MINUS (-)
+						Parse error. Append a U+002D HYPHEN-MINUS (-) character
+						to the comment token's data. Stay in the comment end
+						state. */
+						$this->token['data'] .= '-';
+
+					} elseif ( $char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ' ) {
+						/* TAB, LF, FORM FEED or SPACE - the same set 'comment end space' tests for */
+						$this->token['data'] .= '--' . $char;
+						$state = 'comment end space';
+
+					} elseif ( $char === '!' ) {
+						$state = 'comment end bang';
+
+					} elseif ( $char === false ) {
+						/* EOF
+						Parse error. Emit the comment token. Reconsume the
+						EOF character in the data state. */
+						$this->stream->unget();
+						$state = 'data';
+
+					} else {
+						/* Anything else
+						Parse error. Append two U+002D HYPHEN-MINUS (-)
+						characters and the input character to the comment token's
+						data. Switch to the comment state. */
+
+						$this->token['data'] .= '--' . $char;
+						$state = 'comment';
+					}
+				break;
+
+				case 'comment end bang':
+					$char = $this->stream->char();
+					if ( $char === '>' ) {
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+					} elseif ( $char === "-" ) {
+						$this->token['data'] .= '--!';
+						$state = 'comment end dash';
+					} elseif ( $char === false ) {
+						$this->stream->unget();
+						$state = 'data';
+					} else {
+						$this->token['data'] .= '--!' . $char;
+						$state = 'comment';
+					}
+				break;
+
+				case 'comment end space':
+					$char = $this->stream->char();
+					if ( $char === '>' ) {
+						$state = 'data';
+						if ( $this->checkEntireElementMatching( $matchingEntireElement ) ) {
+							$this->element = $this->stream->getSubstr(
+								$this->elementStart,
+								$this->stream->getPos()
+							);
+							return true;
+						}
+					} elseif ( $char === '-' ) {
+						$state = 'comment end dash';
+					} elseif ($char === "\t" || $char === "\n" || $char === "\x0c" || $char === ' ') {
+						$this->token['data'] .= $char;
+					} elseif ($char === false) {
+						$this->stream->unget();
+						$state = 'data';
+					} else {
+						$this->token['data'] .= $char;
+						$state = 'comment';
+					}
+				break;
+
+				// case 'cdataSection':
+
+			}
+		}
+	}
+
+}
+
+
+
+/*
+
+Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+*/
+
+/** Byte-oriented read cursor over an input string that is normalised on construction. */
+class HTML5_InputStream {
+	/**
+	 * The string being read, after the constructor's BOM, NUL and CR clean-up.
+	 */
+	private $data;
+
+	/**
+	 * Byte offset into $data of the next unread byte; char() can push it past $EOF.
+	 */
+	private $char;
+
+	/**
+	 * Length of $data in bytes; when $char >= $EOF, we are at the end-of-file.
+	 */
+	private $EOF;
+
+	/**
+	 * Parse errors. Starts out empty; no method of this class adds to it.
+	 */
+	public $errors = array();
+
+	/**
+	 * @param string $data Data to parse; it is normalised before being stored
+	 */
+	public function __construct( $data ) {
+
+		/* One leading U+FEFF BYTE ORDER MARK character must be
+		ignored if any are present. */
+		if ( substr( $data, 0, 3 ) === "\xEF\xBB\xBF" ) {
+			$data = substr( $data, 3 );
+		}
+
+		/* All U+0000 NULL characters in the input must be replaced
+		by U+FFFD REPLACEMENT CHARACTERs. Any occurrence of such a
+		character is a parse error.
+
+		U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED
+		(LF) characters are treated specially. Any CR characters
+		that are followed by LF characters must be removed, and any
+		CR characters not followed by LF characters must be converted
+		to LF characters. Thus, newlines in HTML DOMs are represented
+		by LF characters, and there are never any CR characters in the
+		input to the tokenization stage. */
+		$data = str_replace(
+			array(
+				"\0",
+				"\r\n",
+				"\r"
+			),
+			array(
+				"\xEF\xBF\xBD",
+				"\n",
+				"\n"
+			),
+			$data
+		);
+
+		$this->data = $data;
+		$this->char = 0;
+		$this->EOF  = strlen( $data );
+	}
+
+	/**
+	 * Consume and return the next byte, or false once the end has been reached.
+	 * @note The position is advanced even when false is returned (see unget()).
+	 */
+	public function char() {
+		return ( $this->char++ < $this->EOF )
+			? $this->data[$this->char - 1]
+			: false;
+	}
+
+	/**
+	 * Consume bytes until one listed in $bytes (or $max bytes, if given) is hit.
+	 * @param $bytes Bytes to stop at.
+	 * @return string|bool The consumed substring, or false when already at EOF.
+	 */
+	public function charsUntil( $bytes, $max = null ) {
+		if ( $this->char < $this->EOF ) {
+			if ( $max === 0 || $max ) {
+				$len = strcspn( $this->data, $bytes, $this->char, $max );
+			} else {
+				$len = strcspn( $this->data, $bytes, $this->char );
+			}
+			$string = (string) substr( $this->data, $this->char, $len );
+			$this->char += $len;
+			return $string;
+		} else {
+			return false;
+		}
+	}
+
+	/**
+	 * Consume bytes while they are listed in $bytes (at most $max, if given).
+	 * @param $bytes Bytes to match.
+	 * @return string|bool The consumed substring, or false when already at EOF.
+	 */
+	public function charsWhile( $bytes, $max = null ) {
+		if ( $this->char < $this->EOF ) {
+			if ( $max === 0 || $max ) {
+				$len = strspn( $this->data, $bytes, $this->char, $max );
+			} else {
+				$len = strspn( $this->data, $bytes, $this->char );
+			}
+			$string = (string) substr( $this->data, $this->char, $len );
+			$this->char += $len;
+			return $string;
+		} else {
+			return false;
+		}
+	}
+
+	/**
+	 * Unconsume one byte; does nothing once char() has stepped past the end.
+	 */
+	public function unget() {
+		if ( $this->char <= $this->EOF ) {
+			$this->char--;
+		}
+	}
+
+	/**
+	 * Get the current byte position
+	 * @return int
+	 */
+	public function getPos() {
+		return $this->char;
+	}
+
+	/**
+	 * Set the current byte position (no bounds checking is performed)
+	 * @param int $ndx pointer into the data
+	 */
+	public function setPos( $ndx ) {
+		$this->char = $ndx;
+	}
+
+	/**
+	 * Get the bytes of the data in [$start, $end); throws Exception if $end < $start.
+	 * @param int $start Offset of the first byte to include
+	 * @param int $end Offset one past the last byte to include
+	 * @return string
+	 */
+	public function getSubstr( $start, $end ) {
+		if ( $end < $start ) {
+			throw new Exception( 'End was before start?' );
+		}
+		$length = $end - $start;
+		return substr( $this->data, $start, $length );
+	}
+
+}
diff --git a/languages/LanguageConverter.php b/languages/LanguageConverter.php
index eae77fb..0c7809f 100644
--- a/languages/LanguageConverter.php
+++ b/languages/LanguageConverter.php
@@ -353,55 +353,66 @@ class LanguageConverter {
 			return $text;
 		}
 
-		/* we convert everything except:
+		/* Do the conversion. We convert everything except:
 		   1. HTML markups (anything between < and >)
 		   2. HTML entities
 		   3. placeholders created by the parser
 		*/
+
+		// Get regex for parser placeholders
 		global $wgParser;
+		$marker = false;
 		if ( isset( $wgParser ) && $wgParser->UniqPrefix() != '' ) {
-			$marker = '|' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+';
-		} else {
-			$marker = '';
+			$marker = '/' . $wgParser->UniqPrefix() . '[\-a-zA-Z0-9]+/s';
 		}
 
-		// this one is needed when the text is inside an HTML markup
-		$htmlfix = '|<[^>]+$|^[^<>]*>';
-
-		// disable convert to variants between <code> tags
-		$codefix = '<code>.+?<\/code>|';
-		// disable conversion of <script> tags
-		$scriptfix = '<script.*?>.*?<\/script>|';
-		// disable conversion of <pre> tags
-		$prefix = '<pre.*?>.*?<\/pre>|';
+		// Guard against delimiter nulls in the input
+		$text = str_replace( "\000", '', $text );
 
-		$reg = '/' . $codefix . $scriptfix . $prefix .
-			'<[^>]+>|&[a-zA-Z#][a-z0-9]+;' . $marker . $htmlfix . '/s';
 		$startPos = 0;
 		$sourceBlob = '';
 		$literalBlob = '';
-
-		// Guard against delimiter nulls in the input
-		$text = str_replace( "\000", '', $text );
-
 		$markupMatches = null;
 		$elementMatches = null;
-		while ( $startPos < strlen( $text ) ) {
-			if ( preg_match( $reg, $text, $markupMatches, PREG_OFFSET_CAPTURE, $startPos ) ) {
-				$elementPos = $markupMatches[0][1];
-				$element = $markupMatches[0][0];
+
+		$htmlTokenizer = new Html5Tokenizer( $text, array( 'code', 'script', 'pre', 'style' ) );
+		// Match /^[^<>]*>/ in case the $text we're working on starts with the end of
+		// a tag. We set this to false after the first match.
+		$htmlTokenizer->setFlagCloseOnly( true );
+		// Match /<[^>]+$/ in case the $text stops part way through a tag
+		$htmlTokenizer->setFlagOpenOnly( true );
+
+		do {
+			$sourceEnd = strlen( $text );
+			$hasHtml = $htmlTokenizer->parse();
+
+			if ( $hasHtml ) {
+				$elementPos = $htmlTokenizer->elementStart;
+				$element = $htmlTokenizer->element;
+				$htmlTokenizer->setFlagCloseOnly( false );
 			} else {
 				$elementPos = strlen( $text );
 				$element = '';
 			}
 
-			// Queue the part before the markup for translation in a batch
-			$sourceBlob .= substr( $text, $startPos, $elementPos - $startPos ) . "\000";
-
+			// Check for parser markers in the non-html chunk
+			list( $sources, $literals ) = $this->tokenizeParserMarkers(
+				substr( $text, $startPos, $elementPos - $startPos ),
+				$marker
+			);
+			foreach ( $sources as $s ) {
+				$sourceBlob .= "$s\000";
+			}
+			foreach ( $literals as $l ) {
+				$literalBlob .= "$l\000";
+			}
 			// Advance to the next position
 			$startPos = $elementPos + strlen( $element );
 
 			// Translate any alt or title attributes inside the matched element
+			// TODO: $htmlTokenizer already has the attributes parsed, so it would be
+			// more efficient to pull them from  $htmlTokenizer->token. But let's do that
+			// in a public patch.
 			if ( $element !== ''
 				&& preg_match( '/^(<[^>\s]*)\s([^>]*)(.*)$/', $element, $elementMatches )
 			) {
@@ -430,7 +441,7 @@ class LanguageConverter {
 				}
 			}
 			$literalBlob .= $element . "\000";
-		}
+		} while ( $hasHtml );
 
 		// Do the main translation batch
 		$translatedBlob = $this->translate( $sourceBlob, $toVariant );
@@ -451,6 +462,30 @@ class LanguageConverter {
 	}
 
 	/**
+	 * Split a non-HTML chunk of text around the parser markers found in it.
+	 * @return array exactly two arrays: the runs of text between matches of
+	 *	$regex (to be converted), then the matched markers (kept verbatim).
+	 */
+	private function tokenizeParserMarkers( $text, $regex ) {
+		if ( !$regex ) {
+			return array( array( $text ), array() ); // no pattern: nothing to protect
+		}
+		$convertible = array();
+		$verbatim = array();
+		$cursor = 0;
+		$found = preg_match_all( $regex, $text, $hits, PREG_OFFSET_CAPTURE );
+		$hits = $found ? $hits[0] : array();
+		foreach ( $hits as $hit ) {
+			// $hit is array( matched marker, its byte offset in $text )
+			$convertible[] = substr( $text, $cursor, $hit[1] - $cursor );
+			$verbatim[] = $hit[0];
+			$cursor = $hit[1] + strlen( $hit[0] );
+		}
+		$convertible[] = substr( $text, $cursor );
+		return array( $convertible, $verbatim );
+	}
+
+	/**
 	 * Translate a string to a variant.
 	 * Doesn't parse rules or do any of that other stuff, for that use
 	 * convert() or convertTo().
diff --git a/tests/phpunit/includes/Html5TokenizerTest.php b/tests/phpunit/includes/Html5TokenizerTest.php
new file mode 100644
index 0000000..7aea2df
--- /dev/null
+++ b/tests/phpunit/includes/Html5TokenizerTest.php
@@ -0,0 +1,132 @@
+<?php
+
+/**
+ * @group medium
+ */
+class Html5TokenizerTest extends MediaWikiTestCase {
+	/**
+	 * Collect every element parse() reports and compare with the expectation.
+	 * @dataProvider getHtmlFragments
+	 */
+	public function testTransform( $input, $expectedFinds, $msg ) {
+		$result = array();
+		$ht = new Html5Tokenizer( $input, array( 'pre', 'code', 'style', 'script' ) );
+		$ht->setFlagCloseOnly( true ); // also report a leading tag tail such as 'foo>'
+		$ht->setFlagOpenOnly( true ); // also report a trailing unclosed tag such as '<div foo'
+		do {
+			$r = $ht->parse();
+			if ( $r ) {
+				$result[] = $ht->element;
+			}
+		} while ( $r );
+		// assertEquals so $msg is shown; assertArrayEquals() takes $ordered as its 3rd argument
+		$this->assertEquals( $expectedFinds, $result, $msg );
+	}
+
+	/** Provides rows of ( input, elements expected from parse(), message ). */
+	public function getHtmlFragments() {
+		return array(
+			array(
+				"<div bar=asdf>asdf</div>",
+				array( '<div bar=asdf>', '</div>' ),
+				'Basic unquoted attrs'
+			),
+			array(
+				"<div bar='asdf' baz=\"123\">asdf</div>",
+				array( "<div bar='asdf' baz=\"123\">", '</div>' ),
+				'Basic quoted attrs'
+			),
+			array(
+				"<div b'ar='asdf' b\"az=\"123\">asdf</div>",
+				array( "<div b'ar='asdf' b\"az=\"123\">", '</div>' ),
+				'Quoted attrs with quote in attr name'
+			),
+			array(
+				"<div bar='as>df' baz=\"123\">asdf</div>",
+				array( "<div bar='as>df' baz=\"123\">", '</div>' ),
+				'Quoted attr containing >'
+			),
+			array(
+				"<!--div bar='as>df' baz=\"123\">asdf</div -->",
+				array( "<!--div bar='as>df' baz=\"123\">asdf</div -->" ),
+				'Commented elements'
+			),
+			array(
+				"<div bar='as>>></>>>>df\' b'az='1<23'>asdf</div>",
+				array( "<div bar='as>>></>>>>df\' b'az='1<23'>", '</div>' ),
+				'Quoted attr containing <'
+			),
+			array(
+				"<div bar=foo>1</></div>",
+				array( '<div bar=foo>', '</>', '</div>' ),
+				'Immediately closed end tag'
+			),
+			array(
+				'<div "=foo>1</div>',
+				array( '<div "=foo>', '</div>' ),
+				'Attr name is a double quote'
+			),
+			array(
+				"start<div \"=foo",
+				array( '<div "=foo' ),
+				'Unclosed element'
+			),
+			array(
+				'a div "=foo>end',
+				array( 'a div "=foo>' ),
+				'Unopened element close'
+			),
+			array(
+				'<pre>a div "=foo></pre>end',
+				array( '<pre>a div "=foo></pre>' ),
+				'Match entire element'
+			),
+			array(
+				'<pre id=123 id="asdf">a div "=foo></pre>end',
+				array( '<pre id=123 id="asdf">a div "=foo></pre>' ),
+				'Match entire element, with attributes'
+			),
+			array(
+				'<pre id=123 id="asdf"></prea div>< "=foo></pre>end',
+				array( '<pre id=123 id="asdf"></prea div>< "=foo></pre>' ),
+				'Check entire element matching close logic'
+			),
+			array(
+				'<pre> <? bogus comment></prea div>< "=foo></pre>end',
+				array( '<pre> <? bogus comment></prea div>< "=foo></pre>' ),
+				'Check entire element matching, don\'t break on bogus comment'
+			),
+			array(
+				'<pre> something <!-- <bar "=foo></pre> --!>  asdf</pre>end',
+				array( '<pre> something <!-- <bar "=foo></pre> --!>  asdf</pre>' ),
+				'Check entire element matching, don\'t break on end tag inside comment'
+			),
+			array(
+				'<pre> something <!-- <bar "=foo></pre>  asdf</pre>end',
+				array( '<pre> something <!-- <bar "=foo></pre>  asdf</pre>end' ),
+				'Check entire element matching, don\'t break on end tag inside comment that doesn\'t end'
+			),
+			array(
+				'start<pre> something <br/>  asdf</pre>end',
+				array( '<pre> something <br/>  asdf</pre>' ),
+				'Check entire element matching, don\'t break on self-closing tags'
+			),
+			array(
+				'start<pre/> something <br/>  asdf</pre>end',
+				array( '<pre/>', '<br/>', '</pre>' ),
+				'Check entire element matching, return if tag is self closing'
+			),
+			array(
+				'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>t//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">end',
+				array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Stric>', 't//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' ),
+				'Check DOCTYPE matching'
+			),
+			array(
+				'start<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"',
+				array( '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"' ),
+				'Check unclosed DOCTYPE matching'
+			),
+		);
+	}
+
+}
-- 
1.8.4.5

