From 2b0359cc805778af52cd415b02e6db6f7f7e2c58 Mon Sep 17 00:00:00 2001 From: Lucas Werkmeister Date: Tue, 24 Nov 2020 20:16:18 +0100 Subject: [PATCH] SECURITY: Add job to purge entity data on page deletion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make EntityDataPurger handle a second hook, for when an entire page is deleted; however, we don’t handle the deletion directly in the hook handler. Instead, define a new job class to purge the entity data URLs for all archived revisions of a deleted page, and make the hook handler push that job. Deferring the purging to a job ensures that the main transaction (archiving the rows) has committed and we can therefore read the archived rows from a replica (though we still need to wait for replication, in case the job runs immediately), and it’s probably also a good idea in general to not read an unbounded number of archive rows in the web request that is deleting the page. The dependency injection pattern of PurgeEntityDataJob is mainly based on ChangeVisibilityNotificationJob; the practice of using the standard 'namespace' and 'title' params even for a deleted page follows DeletePageJob (MediaWiki core). An additional feature not implemented here would be to pass the hook handler’s $archivedRevisionCount into the job parameters, and have the job log a warning if it processed a different number of archive rows. However, I’ll leave that for later, once this patch becomes public. Bug: T260349 Change-Id: I8f88178721c6c941530a03ca431725820a24d468 --- extension-repo.json | 2 + repo/includes/Hooks/EntityDataPurger.php | 50 +++++- repo/includes/PurgeEntityDataJob.php | 111 ++++++++++++++ .../includes/Hooks/EntityDataPurgerTest.php | 91 ++++++++++- .../includes/PurgeEntityDataJobTest.php | 145 ++++++++++++++++++ 5 files changed, 393 insertions(+), 6 deletions(-) create mode 100644 repo/includes/PurgeEntityDataJob.php create mode 100644 repo/tests/phpunit/includes/PurgeEntityDataJobTest.php diff --git a/extension-repo.json b/extension-repo.json index 32edde2886..b97735d4c5 100644 --- a/extension-repo.json +++ b/extension-repo.json @@ -269,6 +269,7 @@ }, "JobClasses": { "CleanTermsIfUnused": "Wikibase\\Lib\\Store\\Sql\\Terms\\CleanTermsIfUnusedJob::getJobSpecification", + "PurgeEntityData": "Wikibase\\Repo\\PurgeEntityDataJob::newFromGlobalState", "UpdateRepoOnMove": "Wikibase\\Repo\\UpdateRepo\\UpdateRepoOnMoveJob", "UpdateRepoOnDelete": "Wikibase\\Repo\\UpdateRepo\\UpdateRepoOnDeleteJob", "DispatchChangeDeletionNotification": "Wikibase\\Repo\\ChangeModification\\DispatchChangeDeletionNotificationJob" @@ -530,6 +531,7 @@ "ApiQuery::moduleManager": "\\Wikibase\\Repo\\RepoHooks::onApiQueryModuleManager", "ArticleDeleteComplete": [ "DeleteDispatcher", + "EntityDataPurger", "\\Wikibase\\Repo\\RepoHooks::onArticleDeleteComplete" ], "ArticleRevisionVisibilitySet": [ diff --git a/repo/includes/Hooks/EntityDataPurger.php b/repo/includes/Hooks/EntityDataPurger.php index aaa63ed94a..b82ad894ed 100644 --- a/repo/includes/Hooks/EntityDataPurger.php +++ b/repo/includes/Hooks/EntityDataPurger.php @@ -5,7 +5,10 @@ namespace Wikibase\Repo\Hooks; use HtmlCacheUpdater; +use JobQueueGroup; +use JobSpecification; use MediaWiki\Hook\ArticleRevisionVisibilitySetHook; +use MediaWiki\Page\Hook\ArticleDeleteCompleteHook; use Title; use Wikibase\Lib\Store\EntityIdLookup; use Wikibase\Repo\LinkedData\EntityDataUriManager; @@ -14,7 +17,7 @@ /** * @license GPL-2.0-or-later */ -class EntityDataPurger implements ArticleRevisionVisibilitySetHook { +class EntityDataPurger implements ArticleRevisionVisibilitySetHook, ArticleDeleteCompleteHook { /** @var EntityIdLookup */ private $entityIdLookup; @@ -25,14 +28,19 @@ class EntityDataPurger implements ArticleRevisionVisibilitySetHook { /** @var HtmlCacheUpdater */ private $htmlCacheUpdater; + /** @var callable */ + private $jobQueueGroupFactory; + public function __construct( EntityIdLookup $entityIdLookup, EntityDataUriManager $entityDataUriManager, - HtmlCacheUpdater $htmlCacheUpdater + HtmlCacheUpdater $htmlCacheUpdater, + callable $jobQueueGroupFactory ) { $this->entityIdLookup = $entityIdLookup; $this->entityDataUriManager = $entityDataUriManager; $this->htmlCacheUpdater = $htmlCacheUpdater; + $this->jobQueueGroupFactory = $jobQueueGroupFactory; } public static function factory( HtmlCacheUpdater $htmlCacheUpdater ): self { @@ -40,7 +48,8 @@ public static function factory( HtmlCacheUpdater $htmlCacheUpdater ): self { return new self( $wikibaseRepo->getEntityIdLookup(), $wikibaseRepo->getEntityDataUriManager(), - $htmlCacheUpdater + $htmlCacheUpdater, + 'JobQueueGroup::singleton' ); } @@ -68,4 +77,39 @@ public function onArticleRevisionVisibilitySet( $title, $ids, $visibilityChangeM } } + /** + * @param \WikiPage $wikiPage + * @param \User $user + * @param string $reason + * @param int $id + * @param \Content|null $content + * @param \ManualLogEntry $logEntry + * @param int $archivedRevisionCount + * @return bool|void + */ + public function onArticleDeleteComplete( + $wikiPage, + $user, + $reason, + $id, + $content, + $logEntry, + $archivedRevisionCount + ) { + $title = $wikiPage->getTitle(); + $entityId = $this->entityIdLookup->getEntityIdForTitle( $title ); + if ( !$entityId ) { + return; + } + + /** @var JobQueueGroup $jobQueueGroup */ + '@phan-var JobQueueGroup $jobQueueGroup'; + $jobQueueGroup = ( $this->jobQueueGroupFactory )(); + $jobQueueGroup->push( new JobSpecification( 'PurgeEntityData', [ + 'namespace' => $title->getNamespace(), + 'title' => $title->getDBkey(), + 'pageId' => $id, + 'entityId' => $entityId->getSerialization(), + ] ) ); + } } diff --git a/repo/includes/PurgeEntityDataJob.php b/repo/includes/PurgeEntityDataJob.php new file mode 100644 index 0000000000..7c135e6b97 --- /dev/null +++ b/repo/includes/PurgeEntityDataJob.php @@ -0,0 +1,111 @@ +entityIdParser = $entityIdParser; + $this->entityDataUriManager = $entityDataUriManager; + $this->lbFactory = $lbFactory; + $this->htmlCacheUpdater = $htmlCacheUpdater; + $this->batchSize = $batchSize; + } + + public static function newFromGlobalState( Title $unused, array $params ): self { + $repo = WikibaseRepo::getDefaultInstance(); + $services = MediaWikiServices::getInstance(); + return new self( + $repo->getEntityIdParser(), + $repo->getEntityDataUriManager(), + $services->getDBLoadBalancerFactory(), + $services->getHtmlCacheUpdater(), + $services->getMainConfig()->get( 'UpdateRowsPerQuery' ), + $params + ); + } + + public function run(): void { + $entityId = $this->entityIdParser->parse( $this->params['entityId'] ); + + // URLs for latest data... + $urls = $this->entityDataUriManager->getPotentiallyCachedUrls( $entityId ); + foreach ( $this->getArchivedRevisionIds() as $revisionId ) { + // ...and URLs for each specific revision + $urls = array_merge( $urls, $this->entityDataUriManager->getPotentiallyCachedUrls( + $entityId, + $revisionId + ) ); + } + + if ( $urls !== [] ) { + $this->htmlCacheUpdater->purgeUrls( $urls ); + } + } + + private function getArchivedRevisionIds(): iterable { + // read archive rows from a replica, but only after it has caught up with the master + // (in theory we only need to wait for the master pos as of when the job was pushed, + // but DBMasterPos is an opaque object, so we can’t put it in the job params) + $this->lbFactory->waitForReplication( [ + 'domain' => $this->lbFactory->getLocalDomainID(), + ] ); + $dbr = $this->lbFactory->getMainLB()->getConnection( ILoadBalancer::DB_REPLICA ); + + $iterator = new BatchRowIterator( $dbr, 'archive', [ 'ar_id' ], $this->batchSize ); + $iterator->setFetchColumns( [ 'ar_rev_id' ] ); + $iterator->addConditions( [ + 'ar_page_id' => $this->params['pageId'], + 'ar_title' => $this->params['title'], + 'ar_namespace' => $this->params['namespace'], + ] ); + $iterator->setCaller( __METHOD__ ); + + foreach ( $iterator as $batch ) { + foreach ( $batch as $row ) { + yield (int)$row->ar_rev_id; + } + } + } + +} diff --git a/repo/tests/phpunit/includes/Hooks/EntityDataPurgerTest.php b/repo/tests/phpunit/includes/Hooks/EntityDataPurgerTest.php index a58766c619..e4b9f3ac6d 100644 --- a/repo/tests/phpunit/includes/Hooks/EntityDataPurgerTest.php +++ b/repo/tests/phpunit/includes/Hooks/EntityDataPurgerTest.php @@ -5,13 +5,17 @@ namespace Wikibase\Repo\Tests\Hooks; use HtmlCacheUpdater; +use IJobSpecification; +use JobQueueGroup; use PHPUnit\Framework\TestCase; +use stdClass; use Title; use Wikibase\DataModel\Entity\ItemId; use Wikibase\Lib\Store\EntityIdLookup; use Wikibase\Repo\Hooks\EntityDataPurger; use Wikibase\Repo\LinkedData\EntityDataRequestHandler; use Wikibase\Repo\LinkedData\EntityDataUriManager; +use WikiPage; /** * @covers \Wikibase\Repo\Hooks\EntityDataPurger @@ -22,6 +26,20 @@ */ class EntityDataPurgerTest extends TestCase { + private function mockJobQueueGroupFactory( JobQueueGroup $jobQueueGroup = null ): callable { + $factory = $this->getMockBuilder( stdClass::class ) + ->addMethods( [ 'factory' ] ) + ->getMock(); + if ( $jobQueueGroup !== null ) { + $factory->method( 'factory' ) + ->willReturn( $jobQueueGroup ); + } else { + $factory->expects( $this->never() ) + ->method( 'factory' ); + } + return [ $factory, 'factory' ]; + } + public function testGivenEntityIdLookupReturnsNull_handlerDoesNothing() { $title = Title::newFromText( 'Project:About' ); $entityIdLookup = $this->createMock( EntityIdLookup::class ); @@ -35,7 +53,12 @@ public function testGivenEntityIdLookupReturnsNull_handlerDoesNothing() { $htmlCacheUpdater = $this->createMock( HtmlCacheUpdater::class ); $htmlCacheUpdater->expects( $this->never() ) ->method( 'purgeUrls' ); - $purger = new EntityDataPurger( $entityIdLookup, $entityDataUriManager, $htmlCacheUpdater ); + $purger = new EntityDataPurger( + $entityIdLookup, + $entityDataUriManager, + $htmlCacheUpdater, + $this->mockJobQueueGroupFactory() + ); $purger->onArticleRevisionVisibilitySet( $title, [ 1, 2, 3 ], [] ); } @@ -57,7 +80,12 @@ public function testGivenEntityIdLookupReturnsId_handlerPurgesCache() { $htmlCacheUpdater->expects( $this->once() ) ->method( 'purgeUrls' ) ->with( [ 'urlA/Q1/1', 'urlB/Q1/1' ] ); - $purger = new EntityDataPurger( $entityIdLookup, $entityDataUriManager, $htmlCacheUpdater ); + $purger = new EntityDataPurger( + $entityIdLookup, + $entityDataUriManager, + $htmlCacheUpdater, + $this->mockJobQueueGroupFactory() + ); $purger->onArticleRevisionVisibilitySet( $title, [ 1 ], [] ); } @@ -91,8 +119,65 @@ public function testGivenMultipleRevisions_handlerPurgesCacheOnce() { 'urlA/Q1/2', 'urlB/Q1/2', 'urlA/Q1/3', 'urlB/Q1/3', ] ); - $purger = new EntityDataPurger( $entityIdLookup, $entityDataUriManager, $htmlCacheUpdater ); + $purger = new EntityDataPurger( + $entityIdLookup, + $entityDataUriManager, + $htmlCacheUpdater, + $this->mockJobQueueGroupFactory() + ); $purger->onArticleRevisionVisibilitySet( $title, [ 1, 2, 3 ], [] ); } + + public function testDeletionHandlerPushesJob() { + $title = Title::makeTitle( 0, 'Q123' ); + $wikiPage = $this->createMock( WikiPage::class ); + $wikiPage->method( 'getTitle' ) + ->willReturn( $title ); + + $entityIdLookup = $this->createMock( EntityIdLookup::class ); + $entityIdLookup->method( 'getEntityIdForTitle' ) + ->with( $title ) + ->willReturn( new ItemId( 'Q123' ) ); + $entityDataUriManager = $this->createMock( EntityDataUriManager::class ); + $entityDataUriManager->expects( $this->never() ) + ->method( 'getPotentiallyCachedUrls' ); + $htmlCacheUpdater = $this->createMock( HtmlCacheUpdater::class ); + $htmlCacheUpdater->expects( $this->never() ) + ->method( 'purgeUrls' ); + + $jobQueueGroup = $this->createMock( JobQueueGroup::class ); + $jobQueueGroup->expects( $this->once() ) + ->method( 'push' ) + ->with( $this->callback( function ( IJobSpecification $specification ) { + $this->assertSame( 'PurgeEntityData', $specification->getType() ); + $expectedParams = [ + 'namespace' => 0, + 'title' => 'Q123', + 'pageId' => 123, + 'entityId' => 'Q123', + ]; + $actualParams = $specification->getParams(); + ksort( $expectedParams ); + ksort( $actualParams ); + $this->assertSame( $expectedParams, $actualParams ); + return true; + } ) ); + + $purger = new EntityDataPurger( + $entityIdLookup, + $entityDataUriManager, + $htmlCacheUpdater, + $this->mockJobQueueGroupFactory( $jobQueueGroup ) + ); + + $purger->onArticleDeleteComplete( + $wikiPage, + // unused + null, null, + 123, + // unused + null, null, null + ); + } } diff --git a/repo/tests/phpunit/includes/PurgeEntityDataJobTest.php b/repo/tests/phpunit/includes/PurgeEntityDataJobTest.php new file mode 100644 index 0000000000..9404618528 --- /dev/null +++ b/repo/tests/phpunit/includes/PurgeEntityDataJobTest.php @@ -0,0 +1,145 @@ +tablesUsed[] = 'archive'; + } + + public function addDBData() { + $this->db->truncate( 'archive', __METHOD__ ); // T265033 + $this->db->insert( 'archive', [ + // relevant rows: the corresponding URLs should be purged + [ + 'ar_id' => 1, + 'ar_namespace' => 0, + 'ar_title' => 'Q123', + 'ar_page_id' => 123, + 'ar_rev_id' => 1234, + ], + [ + 'ar_id' => 2, + 'ar_namespace' => 0, + 'ar_title' => 'Q123', + 'ar_page_id' => 123, + 'ar_rev_id' => 1235, + ], + // irrelevant row: wrong namespace + [ + 'ar_id' => 3, + 'ar_namespace' => 1, + 'ar_title' => 'Q123', + 'ar_page_id' => 123, + 'ar_rev_id' => 1236, + ], + // irrelevant row: wrong title + [ + 'ar_id' => 4, + 'ar_namespace' => 0, + 'ar_title' => 'Q124', + 'ar_page_id' => 123, + 'ar_rev_id' => 1237, + ], + // irrelevant row: wrong page ID + // (possible if the same namespace+title was deleted several times) + [ + 'ar_id' => 5, + 'ar_namespace' => 1, + 'ar_title' => 'Q123', + 'ar_page_id' => 124, + 'ar_rev_id' => 1238, + ], + ], __METHOD__ ); + } + + public function testRun_purgesPotentiallyCachedUrls() { + $itemId = new ItemId( 'Q123' ); + + $entityDataUriManager = $this->createMock( EntityDataUriManager::class ); + $entityDataUriManager->expects( $this->exactly( 3 ) ) + ->method( 'getPotentiallyCachedUrls' ) + ->withConsecutive( + [ $itemId ], + [ $itemId, 1234 ], + [ $itemId, 1235 ] + ) + ->willReturnOnConsecutiveCalls( + [ '/Special:EntityData/Q123' ], + [ '/Special:EntityData/Q123?revision=1234' ], + [ '/Special:EntityData/Q123?revision=1235' ] + ); + + $htmlCacheUpdater = $this->createMock( HtmlCacheUpdater::class ); + $htmlCacheUpdater->expects( $this->once() ) + ->method( 'purgeUrls' ) + ->with( [ + '/Special:EntityData/Q123', + '/Special:EntityData/Q123?revision=1234', + '/Special:EntityData/Q123?revision=1235', + ] ); + + $job = new PurgeEntityDataJob( + new ItemIdParser(), + $entityDataUriManager, + new LBFactorySingle( [ 'connection' => $this->db ] ), + $htmlCacheUpdater, + 1, + [ + 'namespace' => 0, + 'title' => 'Q123', + 'pageId' => 123, + 'entityId' => 'Q123', + ] + ); + + $job->run(); + } + + public function testRun_doesNotCallPurgeWithEmptyUrls() { + $entityDataUriManager = $this->createMock( EntityDataUriManager::class ); + $entityDataUriManager->method( 'getPotentiallyCachedUrls' ) + ->willReturn( [] ); + + $htmlCacheUpdater = $this->createMock( HtmlCacheUpdater::class ); + $htmlCacheUpdater->expects( $this->never() ) + ->method( 'purgeUrls' ); + + $job = new PurgeEntityDataJob( + new ItemIdParser(), + $entityDataUriManager, + new LBFactorySingle( [ 'connection' => $this->db ] ), + $htmlCacheUpdater, + 100, + [ + 'namespace' => 0, + 'title' => 'Q456', + 'pageId' => 456, + 'entityId' => 'Q456', + ] + ); + + $job->run(); + } + +} -- 2.27.0