From cc0aa1784ab713b7fe7f1290d342affa88261de9 Mon Sep 17 00:00:00 2001
From: csteipp <csteipp@wikimedia.org>
Date: Thu, 12 Mar 2015 15:49:22 -0700
Subject: [PATCH] SECURITY: Don't allow entities in XMP with HHVM

Test for, and refuse to parse, XMP chunks with a doctype declaration
when parsing XMP under HHVM.

Bug: T85848
Change-Id: Iea4feb077ee85a35509a920153daaa9321ee69f3
---
 includes/media/BitmapMetadataHandler.php           |  6 +-
 includes/media/JpegMetadataExtractor.php           |  2 +-
 includes/media/XMP.php                             | 96 ++++++++++++++++++++++
 tests/phpunit/data/xmp/doctype-included.result.php |  3 +
 tests/phpunit/data/xmp/doctype-included.xmp        | 12 +++
 tests/phpunit/includes/media/XMPTest.php           | 37 +++++++++
 6 files changed, 152 insertions(+), 4 deletions(-)
 create mode 100644 tests/phpunit/data/xmp/doctype-included.result.php
 create mode 100644 tests/phpunit/data/xmp/doctype-included.xmp

diff --git a/includes/media/BitmapMetadataHandler.php b/includes/media/BitmapMetadataHandler.php
index bb7a1e8..c8d37bb 100644
--- a/includes/media/BitmapMetadataHandler.php
+++ b/includes/media/BitmapMetadataHandler.php
@@ -154,7 +154,7 @@ class BitmapMetadataHandler {
 	 * @throws MWException On invalid file.
 	 */
 	static function Jpeg( $filename ) {
-		$showXMP = function_exists( 'xml_parser_create_ns' );
+		$showXMP = XMPReader::isSupported();
 		$meta = new self();
 
 		$seg = JpegMetadataExtractor::segmentSplitter( $filename );
@@ -196,7 +196,7 @@ class BitmapMetadataHandler {
 	 * @return array Array for storage in img_metadata.
 	 */
 	public static function PNG( $filename ) {
-		$showXMP = function_exists( 'xml_parser_create_ns' );
+		$showXMP = XMPReader::isSupported();
 
 		$meta = new self();
 		$array = PNGMetadataExtractor::getMetadata( $filename );
@@ -236,7 +236,7 @@ class BitmapMetadataHandler {
 			$meta->addMetadata( array( 'GIFFileComment' => $baseArray['comment'] ), 'native' );
 		}
 
-		if ( $baseArray['xmp'] !== '' && function_exists( 'xml_parser_create_ns' ) ) {
+		if ( $baseArray['xmp'] !== '' && XMPReader::isSupported() ) {
 			$xmp = new XMPReader();
 			$xmp->parse( $baseArray['xmp'] );
 			$xmpRes = $xmp->getResults();
diff --git a/includes/media/JpegMetadataExtractor.php b/includes/media/JpegMetadataExtractor.php
index 8c5b46b..aaa9930 100644
--- a/includes/media/JpegMetadataExtractor.php
+++ b/includes/media/JpegMetadataExtractor.php
@@ -48,7 +48,7 @@ class JpegMetadataExtractor {
 	 * @throws MWException If given invalid file.
 	 */
 	static function segmentSplitter( $filename ) {
-		$showXMP = function_exists( 'xml_parser_create_ns' );
+		$showXMP = XMPReader::isSupported();
 
 		$segmentCount = 0;
 
diff --git a/includes/media/XMP.php b/includes/media/XMP.php
index 0d341aa..b66e150 100644
--- a/includes/media/XMP.php
+++ b/includes/media/XMP.php
@@ -80,6 +80,12 @@ class XMPReader {
 	/** @var int */
 	private $extendedXMPOffset = 0;
 
+	/** @var int Flag determining if the XMP is safe to parse **/
+	private $parsable = 0;
+
+	/** @var string Buffer of XML to parse **/
+	private $xmlParsableBuffer = '';
+
 	/**
 	 * These are various mode constants.
 	 * they are used to figure out what to do
@@ -108,6 +114,12 @@ class XMPReader {
 	const NS_RDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
 	const NS_XML = 'http://www.w3.org/XML/1998/namespace';
 
+	// States used while determining if XML is safe to parse
+	const PARSABLE_UNKNOWN = 0;
+	const PARSABLE_OK = 1;
+	const PARSABLE_BUFFERING = 2;
+	const PARSABLE_NO = 3;
+
 	/**
 	 * Constructor.
 	 *
@@ -145,6 +157,9 @@ class XMPReader {
 			array( $this, 'endElement' ) );
 
 		xml_set_character_data_handler( $this->xmlParser, array( $this, 'char' ) );
+
+		$this->parsable = self::PARSABLE_UNKNOWN;
+		$this->xmlParsableBuffer = '';
 	}
 
 	/** Destroy the xml parser
@@ -156,6 +171,14 @@ class XMPReader {
 		xml_parser_free( $this->xmlParser );
 	}
 
+	/**
+	 * Check if this instance supports using this class
+	 */
+	public static function isSupported() {
+		$xmlReader = wfIsHHVM() ? class_exists( 'XMLReader' ) : true;
+		return function_exists( 'xml_parser_create_ns' ) && $xmlReader;
+	}
+
 	/** Get the result array. Do some post-processing before returning
 	 * the array, and transform any metadata that is special-cased.
 	 *
@@ -305,6 +328,22 @@ class XMPReader {
 				wfRestoreWarnings();
 			}
 
+			// Ensure the XMP block does not have an xml doctype declaration, which
+			// could declare entities unsafe to parse with xml_parse on HHVM (T85848).
+			if ( wfIsHHVM() && $this->parsable !== self::PARSABLE_OK ) {
+				if ( $this->parsable === self::PARSABLE_NO ) {
+					throw new Exception( 'Unsafe doctype declaration in XML.' );
+				}
+
+				$content = $this->xmlParsableBuffer . $content;
+				if ( !$this->checkParseSafety( $content ) ) {
+					$msg = ( $this->parsable === self::PARSABLE_NO ) ?
+						'Unsafe doctype declaration in XML.' :
+						'Not enough XML to check for safety, buffering.';
+					throw new Exception( $msg );
+				}
+			}
+
 			$ok = xml_parse( $this->xmlParser, $content, $allOfIt );
 			if ( !$ok ) {
 				$error = xml_error_string( xml_get_error_code( $this->xmlParser ) );
@@ -437,6 +476,63 @@ class XMPReader {
 		}
 	}
 
+	/**
+	 * Check if a block of XML is safe to pass to xml_parse, i.e. doesn't
+	 * contain a doctype declaration which could contain a dos attack if we
+	 * parse it and expand internal entities (T85848).
+	 *
+	 * @param string $content xml string to check for parse safety
+	 * @return bool true if the xml is safe to parse, false otherwise
+	 */
+	private function checkParseSafety( $content ) {
+		$reader = new XMLReader();
+		$result = null;
+
+		// For XMLReader to parse incomplete/invalid XML, it has to be open()'ed
+		// instead of using XML(). XMP blocks must be less than 64KB, so urlencoding
+		// isn't completely crazy.
+		$reader->open(
+			'data://text/plain,' . urlencode( $content ),
+			null,
+			LIBXML_NOERROR | LIBXML_NOWARNING | LIBXML_NONET
+		);
+
+		$oldDisable = libxml_disable_entity_loader( true );
+		$reset = new ScopedCallback(
+			'libxml_disable_entity_loader',
+			array( $oldDisable )
+		);
+		$reader->setParserProperty( XMLReader::SUBST_ENTITIES, false );
+
+		// Even with LIBXML_NOWARNING set, XMLReader::read gives a warning
+		// when parsing truncated XML, which causes unit tests to fail.
+		wfSuppressWarnings();
+		while ( $reader->read() ) {
+			if ( $reader->nodeType === XMLReader::ELEMENT ) {
+				// Reached the first element without hitting a doctype declaration
+				$this->parsable = self::PARSABLE_OK;
+				$result = true;
+				break;
+			}
+			if ( $reader->nodeType === XMLReader::DOC_TYPE ) {
+				$this->parsable = self::PARSABLE_NO;
+				$result = false;
+				break;
+			}
+		}
+		wfRestoreWarnings();
+
+		if ( !is_null( $result ) ) {
+			return $result;
+		}
+
+		// Reached the end of the parsable xml without finding an element
+		// or doctype. Buffer and try again.
+		$this->parsable = self::PARSABLE_BUFFERING;
+		$this->xmlParsableBuffer = $content;
+		return false;
+	}
+
 	/** When we hit a closing element in MODE_IGNORE
 	 * Check to see if this is the element we started to ignore,
 	 * in which case we get out of MODE_IGNORE
diff --git a/tests/phpunit/data/xmp/doctype-included.result.php b/tests/phpunit/data/xmp/doctype-included.result.php
new file mode 100644
index 0000000..9a9cc36
--- /dev/null
+++ b/tests/phpunit/data/xmp/doctype-included.result.php
@@ -0,0 +1,3 @@
+<?php
+
+$result = array();
diff --git a/tests/phpunit/data/xmp/doctype-included.xmp b/tests/phpunit/data/xmp/doctype-included.xmp
new file mode 100644
index 0000000..8c94675
--- /dev/null
+++ b/tests/phpunit/data/xmp/doctype-included.xmp
@@ -0,0 +1,12 @@
+<?xpacket begin="﻿" id="W5M0MpCehiHzreSzNTczkc9d"?> <!DOCTYPE x:xmpmeta [ <!ENTITY lol "lollollollollollollollollollollol"> ]>
+<x:xmpmeta xmlns:x="adobe:ns:meta/" x:xmptk="Adobe XMP Core
+ 4.1.3-c001 49.282696, Mon Apr 02 2007 21:16:10        ">
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+<rdf:Description
+ rdf:about=""
+ xmlns:exif="http://ns.adobe.com/exif/1.0/"
+ exif:DigitalZoomRatio="0/10">
+<exif:Flash rdf:parseType='Resource'>
+<exif:Fired>True</exif:Fired> <exif:Return>0</exif:Return> <exif:Mode>1</exif:Mode> <exif:Function>False</exif:Function> <exif:RedEyeMode>False</exif:RedEyeMode></exif:Flash> </rdf:Description> </rdf:RDF> </x:xmpmeta>
+
+<?xpacket end="w"?>
diff --git a/tests/phpunit/includes/media/XMPTest.php b/tests/phpunit/includes/media/XMPTest.php
index 798e492..df31994 100644
--- a/tests/phpunit/includes/media/XMPTest.php
+++ b/tests/phpunit/includes/media/XMPTest.php
@@ -62,6 +62,10 @@ class XMPTest extends MediaWikiTestCase {
 			array( 'gps', 'Handling of exif GPS parameters in XMP' ),
 		);
 
+		if ( wfIsHHVM() ) {
+			$xmpFiles[] = array( 'doctype-included', 'XMP includes doctype' );
+		}
+
 		foreach ( $xmpFiles as $file ) {
 			$xmp = file_get_contents( $xmpPath . $file[0] . '.xmp' );
 			// I'm not sure if this is the best way to handle getting the
@@ -170,4 +174,37 @@ class XMPTest extends MediaWikiTestCase {
 
 		$this->assertEquals( $expected, $actual );
 	}
+
+	/**
+	 * Test for multi-section, hostile XML
+	 * @covers checkParseSafety
+	 */
+	public function testCheckParseSafety() {
+		if ( !wfIsHHVM() ) {
+			$this->markTestSkipped( 'No safety check for non-hhvm' );
+		}
+
+		// Test for detection
+		$xmpPath = __DIR__ . '/../../data/xmp/';
+		$file = fopen( $xmpPath . 'doctype-included.xmp', 'rb' );
+		$valid = false;
+		$reader = new XMPReader();
+		do {
+			$chunk = fread( $file, 10 );
+			$valid = $reader->parse( $chunk, feof( $file ) ) || $valid;
+		} while ( !feof( $file ) );
+		$this->assertFalse( $valid );
+		fclose( $file );
+		unset( $reader );
+
+		// Test for false positives
+		$file = fopen( $xmpPath . '1.xmp', 'rb' );
+		$valid = false;
+		$reader = new XMPReader();
+		do {
+			$chunk = fread( $file, 10 );
+			$valid = $reader->parse( $chunk, feof( $file ) ) || $valid;
+		} while ( !feof( $file ) );
+		$this->assertTrue( $valid );
+	}
 }
-- 
1.8.4.5

