diff --git a/includes/DT_PageComponent.php b/includes/DT_PageComponent.php index 7fc8b12..0d160c1 100644 --- a/includes/DT_PageComponent.php +++ b/includes/DT_PageComponent.php @@ -1,163 +1,167 @@ mTemplateName = trim( $templateName ); $dtPageComponent->mIsTemplate = true; $dtPageComponent->mFields = []; self::$mUnnamedFieldCounter = 1; return $dtPageComponent; }

	/**
	 * Creates a component that holds a run of free text rather than a
	 * template call. Each call takes the next value of the shared
	 * free-text ID counter.
	 *
	 * @param string $freeText
	 * @return DTPageComponent
	 */
	public static function newFreeText( $freeText ) {
		$dtPageComponent = new DTPageComponent();
		$dtPageComponent->mIsTemplate = false;
		$dtPageComponent->mFreeText = $freeText;
		$dtPageComponent->mFreeTextID = self::$mFreeTextIDCounter++;
		return $dtPageComponent;
	}

	/**
	 * @return array Field values keyed by field name (or by number, for
	 *  unnamed fields). A value is either a string or an array of
	 *  DTPageComponent objects.
	 */
	public function getFields() {
		return $this->mFields;
	}

	/**
	 * Sets (or overwrites) the value of a named field.
	 *
	 * @param string|int $fieldName
	 * @param string|DTPageComponent[] $fieldValue Either the raw text of
	 *  the field, or the list of components parsed out of that text.
	 */
	public function addNamedField( $fieldName, $fieldValue ) {
		// DTPageStructure::newFromTitle() stores an array of sub-components
		// here; trim() only makes sense for (and only accepts) strings.
		if ( !is_array( $fieldValue ) ) {
			$fieldValue = trim( $fieldValue );
		}
		$this->mFields[trim( $fieldName )] = $fieldValue;
	}

	/**
	 * Adds a field with no name, keyed by the next value of the shared
	 * unnamed-field counter.
	 *
	 * @param string $fieldValue
	 */
	public function addUnnamedField( $fieldValue ) {
		$fieldName = self::$mUnnamedFieldCounter++;
		$this->mFields[$fieldName] = trim( $fieldValue );
	}

	/**
	 * @return bool True for a template call, false for free text.
	 */
	public function isTemplate() {
		return $this->mIsTemplate;
	}

	/**
	 * @return string|null The template name; not set by newFreeText().
	 */
	public function getTemplateName() {
		return $this->mTemplateName;
	}

	/**
	 * Serializes this component back into wikitext: the stored text for
	 * free text, or a "{{Name|...}}" call for a template.
	 *
	 * @return string
	 */
	public function toWikitext() {
		if ( !$this->mIsTemplate ) {
			return $this->mFreeText;
		}

		$wikitext = '{{' . $this->mTemplateName;
		foreach ( $this->mFields as $fieldName => $fieldValue ) {
			// Numeric keys are unnamed fields, so no "name=" prefix.
			$wikitext .= is_numeric( $fieldName ) ? '|' : "\n|$fieldName=";
			if ( is_array( $fieldValue ) ) {
				foreach ( $fieldValue as $subTemplate ) {
					$wikitext .= $subTemplate->toWikitext();
				}
			} else {
				$wikitext .= $fieldValue;
			}
		}
		return $wikitext . "\n}}";
	}

	/**
	 * Serializes this component into an XML fragment.
	 *
	 * @param bool $isSimplified If true, template and field names are used
	 *  as the tag names; otherwise generic tags with a "name" attribute
	 *  are used.
	 * @return string
	 */
	public function toXML( $isSimplified ) {
		global $wgDataTransferViewXMLParseFields;
		global $wgDataTransferViewXMLParseFreeText;
		global $wgTitle;

		$parser = MediaWikiServices::getInstance()->getParser();

		if ( $this->mIsTemplate ) {
			if ( method_exists( 'MediaWiki\MediaWikiServices', 'getContentLanguage' ) ) {
				// MW 1.32+
				$contLang = MediaWikiServices::getInstance()->getContentLanguage();
			} else {
				global $wgContLang;
				$contLang = $wgContLang;
			}
			$namespace_labels = $contLang->getNamespaces();
			$template_label = $namespace_labels[NS_TEMPLATE];
			$field_str = str_replace( ' ', '_', wfMessage( 'dt_xml_field' )->inContentLanguage()->text() );
			$name_str = str_replace( ' ', '_', wfMessage( 'dt_xml_name' )->inContentLanguage()->text() );

			$bodyXML = '';
			foreach ( $this->mFields as $fieldName => $fieldValue ) {
				$holdsComponents = is_array( $fieldValue );
				$fieldValueXML = '';
				if ( $holdsComponents ) {
					// If this field itself holds template calls,
					// get the XML for those calls.
					foreach ( $fieldValue as $subComponent ) {
						$fieldValueXML .= $subComponent->toXML( $isSimplified );
					}
				} elseif ( $wgDataTransferViewXMLParseFields ) {
					// Avoid table of contents and "edit" links
					$fieldValue = $parser->parse(
						"__NOTOC__ __NOEDITSECTION__\n" . $fieldValue,
						$wgTitle,
						ParserOptions::newFromAnon()
					)->getText();
				}

				if ( $isSimplified ) {
					if ( is_numeric( $fieldName ) ) {
						// add "Field" to the beginning of the file name, since
						// XML tags that are simply numbers aren't allowed
						$fieldTag = $field_str . '_' . $fieldName;
					} else {
						$fieldTag = str_replace( ' ', '_', trim( $fieldName ) );
					}
					$attrs = null;
				} else {
					$fieldTag = $field_str;
					$attrs = [ $name_str => $fieldName ];
				}

				// Nested components are already XML; plain values get escaped.
				if ( $holdsComponents ) {
					$bodyXML .= Xml::tags( $fieldTag, $attrs, $fieldValueXML );
				} else {
					$bodyXML .= Xml::element( $fieldTag, $attrs, $fieldValue );
				}
			}

			if ( $isSimplified ) {
				$templateName = str_replace( ' ', '_', $this->mTemplateName );
				return Xml::tags( $templateName, null, $bodyXML );
			}
			return Xml::tags( $template_label, [ $name_str => $this->mTemplateName ], $bodyXML );
		}

		$free_text_str = str_replace( ' ', '_', wfMessage( 'dt_xml_freetext' )->inContentLanguage()->text() );
		$freeText = $this->mFreeText;
		if ( $wgDataTransferViewXMLParseFreeText ) {
			// Undo the escaping that happened before. The search strings
			// are the numeric character references for "{" and "}"; they
			// are built with ord() so that the literal cannot be turned
			// into a no-op by tooling that decodes entities.
			$escapedBraces = [
				sprintf( '&#%d;', ord( '{' ) ),
				sprintf( '&#%d;', ord( '}' ) )
			];
			$freeText = str_replace( $escapedBraces, [ '{', '}' ], $freeText );
			// Get rid of table of contents. Check for the method that is
			// actually called, not just for getInstance().
			if ( method_exists( '\MediaWiki\MediaWikiServices', 'getMagicWordFactory' ) ) {
				// MW 1.32+
				$mw = \MediaWiki\MediaWikiServices::getInstance()->getMagicWordFactory()->get( 'toc' );
			} else {
				$mw = MagicWord::get( 'toc' );
			}
			if ( $mw->match( $freeText ) ) {
				$freeText = $mw->replace( '', $freeText );
			}
			// Avoid "edit" links.
			$freeText = $parser->parse(
				"__NOTOC__ __NOEDITSECTION__\n" . $freeText,
				$wgTitle,
				ParserOptions::newFromAnon()
			)->getText();
		}
		return Xml::element( $free_text_str, [ 'id' => $this->mFreeTextID ], $freeText );
	}

}
diff --git a/includes/DT_PageStructure.php b/includes/DT_PageStructure.php index a16cf54..065053f 100644 --- a/includes/DT_PageStructure.php +++ b/includes/DT_PageStructure.php @@ -1,260 +1,260 @@ mComponents[] = $dtPageComponent; DTPageComponent::$mFreeTextIDCounter = 1; }

	/**
	 * Builds the structure of an existing wiki page: its text is parsed
	 * into components, and any template field whose value itself contains
	 * "{{" is parsed again into a list of sub-components.
	 *
	 * @param Title $pageTitle
	 * @return DTPageStructure
	 */
	public static function newFromTitle( $pageTitle ) {
		$pageStructure = new DTPageStructure();
		$pageStructure->mPageTitle = $pageTitle;
		$wiki_page = WikiPage::factory( $pageTitle );
		$page_contents = ContentHandler::getContentText( $wiki_page->getContent() );
		$pageStructure->parsePageContents( $page_contents );

		// Now, go through the field values and see if any of them
		// hold template calls - if any of them do, parse the value
		// as if it's the full contents of a page, and add the
		// resulting "components" to that field.
		foreach ( $pageStructure->mComponents as $pageComponent ) {
			if ( !$pageComponent->isTemplate() ) {
				continue;
			}
			foreach ( $pageComponent->getFields() as $fieldName => $fieldValue ) {
				if ( strpos( $fieldValue, '{{' ) === false ) {
					continue;
				}
				$dummyPageStructure = new DTPageStructure();
				$dummyPageStructure->parsePageContents( $fieldValue );
				$pageComponent->addNamedField( $fieldName, $dummyPageStructure->mComponents );
			}
		}
		return $pageStructure;
	}

	/**
	 * Parses the contents of a wiki page, turning template calls into
	 * an array of DTPageComponent objects.
*
	 * Text that should not be read as a template call (variables, parser
	 * functions, transclusions, template parameters and tables) has its
	 * curly brackets escaped first, and every link is swapped for a
	 * "dummyLinkN" placeholder so that a "|" or "=" inside a link is not
	 * taken for a field separator. The placeholders are swapped back
	 * before a value is stored.
	 *
	 * @param string $page_contents Wikitext to parse; the resulting
	 *  components are added to this object via addComponent().
	 */
	public function parsePageContents( $page_contents ) {
		// Numeric character references for "{" and "}". They are built
		// with ord() so that the literals cannot be turned into plain
		// braces (making every escape below a no-op) by tooling that
		// decodes entities.
		$open = sprintf( '&#%d;', ord( '{' ) );
		$close = sprintf( '&#%d;', ord( '}' ) );

		// escape out variables like "{{PAGENAME}}"
		$page_contents = str_replace( '{{PAGENAME}}', $open . $open . 'PAGENAME' . $close . $close, $page_contents );
		// escape out parser functions
		$page_contents = preg_replace( '/{{(#.+)}}/', $open . $open . '$1' . $close . $close, $page_contents );
		// escape out transclusions, and calls like "DEFAULTSORT"
		$page_contents = preg_replace( '/{{(.*:.+)}}/', $open . $open . '$1' . $close . $close, $page_contents );
		// escape out variable names
		$page_contents = str_replace( '{{{', $open . $open . $open, $page_contents );
		$page_contents = str_replace( '}}}', $close . $close . $close, $page_contents );
		// escape out tables
		$page_contents = str_replace( '{|', $open . '|', $page_contents );
		$page_contents = str_replace( '|}', '|' . $close, $page_contents );

		// Replace any links in the content with dummy names
		// to avoid parsing of internal link content. These
		// dummies are replaced back later. Each link gets its own
		// number, which is its index in $links.
		$links = [];
		$page_contents = preg_replace_callback(
			'/\[+[^\]]*\]+/',
			static function ( array $match ) use ( &$links ) {
				$links[] = $match[0];
				return 'dummyLink' . ( count( $links ) - 1 );
			},
			$page_contents
		);

		// traverse the page contents, one character at a time
		$uncompleted_curly_brackets = 0;
		$free_text = "";
		$template_name = "";
		$field_name = "";
		$field_value = "";
		$field_has_name = false;
		$creating_template_name = false;
		$creating_field_name = false;
		$curTemplate = null;
		$length = strlen( $page_contents );
		$lastIndex = $length - 1;
		for ( $i = 0; $i < $length; $i++ ) {
			$c = $page_contents[$i];
			if ( $uncompleted_curly_brackets == 0 ) {
				if ( $c == "{" || $i == $lastIndex ) {
					if ( $i == $lastIndex ) {
						$free_text .= $c;
					}
					$uncompleted_curly_brackets++;
					$free_text = trim( $free_text );
					if ( $free_text !== "" ) {
						$freeTextComponent = DTPageComponent::newFreeText(
							$this->restoreLinkPlaceholders( $free_text, $links )
						);
						$this->addComponent( $freeTextComponent );
						$free_text = "";
					}
				} else {
					$free_text .= $c;
				}
			} elseif ( $uncompleted_curly_brackets == 1 ) {
				if ( $c == "{" ) {
					$uncompleted_curly_brackets++;
					$creating_template_name = true;
				} elseif ( $c == "}" ) {
					$uncompleted_curly_brackets--;
					// Two closing brackets in a row end a template call.
					if ( $page_contents[$i - 1] == '}' && $curTemplate !== null ) {
						$this->addComponent( $curTemplate );
					}
					$template_name = "";
				}
			} elseif ( $uncompleted_curly_brackets == 2 ) {
				if ( $c == "}" ) {
					$uncompleted_curly_brackets--;
				}
				if ( $c == "{" ) {
					$uncompleted_curly_brackets++;
					$field_value .= $c;
				} elseif ( $creating_template_name ) {
					if ( $c == "|" || $c == "}" ) {
						$curTemplate = DTPageComponent::newTemplate( $template_name );
						$creating_template_name = false;
						$creating_field_name = true;
					} else {
						$template_name .= $c;
					}
				} elseif ( $c == "|" || $c == "}" ) {
					if ( $field_has_name ) {
						// Replace back the dummy links and escaped symbols with actual ones.
						$field_value = $this->restoreLinkPlaceholders( $field_value, $links );
						$field_value = str_replace( [ $open, $close ], [ '{', '}' ], $field_value );
						$curTemplate->addNamedField( $field_name, $field_value );
						$field_value = "";
						$field_has_name = false;
					} else {
						// "field_name" is actually the value
						$curTemplate->addUnnamedField(
							$this->restoreLinkPlaceholders( $field_name, $links )
						);
					}
					$creating_field_name = true;
					$field_name = "";
				} elseif ( $c == "=" ) {
					// handle case of = in value
					if ( !$creating_field_name ) {
						$field_value .= $c;
					} else {
						$creating_field_name = false;
						$field_has_name = true;
					}
				} elseif ( $creating_field_name ) {
					$field_name .= $c;
				} else {
					$field_value .= $c;
				}
			} else {
				// greater than 2
				if ( $c == "}" ) {
					$uncompleted_curly_brackets--;
				} elseif ( $c == "{" ) {
					$uncompleted_curly_brackets++;
				}
				$field_value .= $c;
			}
		}
	}

	/**
	 * Swaps every "dummyLinkN" placeholder in the given text back to the
	 * link it stands for. A placeholder whose number has no entry in
	 * $links is left untouched.
	 *
	 * @param string $text
	 * @param string[] $links Original link texts, indexed by placeholder number.
	 * @return string
	 */
	private function restoreLinkPlaceholders( $text, array $links ) {
		return preg_replace_callback(
			'/dummyLink(\d+)/',
			static function ( array $match ) use ( $links ) {
				$linkNum = (int)$match[1];
				return isset( $links[$linkNum] ) ? $links[$linkNum] : $match[0];
			},
			$text
		);
	}

	/**
	 * Helper function for mergeInPageStructure().
*
	 * Counts how often each template is called among this page's
	 * components and returns the names of those called exactly once.
	 *
	 * @return string[]
	 */
	private function getSingleInstanceTemplates() {
		$instancesPerTemplate = [];
		foreach ( $this->mComponents as $pageComponent ) {
			if ( !$pageComponent->isTemplate() ) {
				continue;
			}
			$templateName = $pageComponent->getTemplateName();
			if ( isset( $instancesPerTemplate[$templateName] ) ) {
				$instancesPerTemplate[$templateName]++;
			} else {
				$instancesPerTemplate[$templateName] = 1;
			}
		}

		$singleInstanceTemplates = [];
		foreach ( $instancesPerTemplate as $templateName => $instances ) {
			if ( $instances === 1 ) {
				// PHP turns numeric-looking array keys into integers;
				// cast back so that callers always get strings.
				$singleInstanceTemplates[] = (string)$templateName;
			}
		}
		return $singleInstanceTemplates;
	}

	/**
	 * Finds the first template component with the given name.
	 *
	 * @param string $templateName
	 * @return int|null Index into the component list, or null if there
	 *  is no template call with that name.
	 */
	private function getIndexOfTemplateName( $templateName ) {
		foreach ( $this->mComponents as $i => $pageComponent ) {
			// Free-text components have no template name, so skip them
			// instead of comparing against their unset name.
			if ( $pageComponent->isTemplate()
				&& (string)$pageComponent->getTemplateName() === (string)$templateName
			) {
				return $i;
			}
		}
		return null;
	}

	/**
	 * Used when doing a "merge" in an XML or CSV import.
	 *
	 * @param DTPageStructure $secondPageStructure The structure whose
	 *  components get merged into this one.
	 */
	public function mergeInPageStructure( $secondPageStructure ) {
		// If there are any templates that have one instance in both
		// pages, replace values for their fields with values from
		// the second page.
		$singleInstanceTemplatesInBoth = array_intersect(
			$this->getSingleInstanceTemplates(),
			$secondPageStructure->getSingleInstanceTemplates()
		);

		foreach ( $secondPageStructure->mComponents as $pageComponent ) {
			$indexOfThisTemplate = null;
			if ( $pageComponent->isTemplate()
				&& in_array( (string)$pageComponent->getTemplateName(), $singleInstanceTemplatesInBoth, true )
			) {
				$indexOfThisTemplate = $this->getIndexOfTemplateName( $pageComponent->getTemplateName() );
			}

			// Everything else - free text, repeated templates, templates
			// found only in the second page - is appended as-is.
			if ( $indexOfThisTemplate === null ) {
				$this->mComponents[] = $pageComponent;
				continue;
			}

			foreach ( $pageComponent->getFields() as $fieldName => $fieldValue ) {
				$this->mComponents[$indexOfThisTemplate]->addNamedField( $fieldName, $fieldValue );
			}
		}
	}

	/**
	 * Serializes all components to wikitext, one per line, with
	 * surrounding whitespace trimmed from the result.
	 *
	 * @return string
	 */
	public function toWikitext() {
		$pieces = [];
		foreach ( $this->mComponents as $pageComponent ) {
			$pieces[] = $pageComponent->toWikitext();
		}
		return trim( implode( "\n", $pieces ) );
	}

	/**
	 * Serializes the page - ID, title and all components - to XML.
	 *
	 * @param bool $isSimplified If true, ID and title are written as child
	 *  elements; otherwise they are attributes of the page element.
	 * @return string
	 */
	public function toXML( $isSimplified ) {
		$page_str = str_replace( ' ', '_', wfMessage( 'dt_xml_page' )->inContentLanguage()->text() );
		$id_str = str_replace( ' ', '_', wfMessage( 'dt_xml_id' )->inContentLanguage()->text() );
		$title_str = str_replace( ' ', '_', wfMessage( 'dt_xml_title' )->inContentLanguage()->text() );

		$bodyXML = '';
		foreach ( $this->mComponents as $pageComponent ) {
			$bodyXML .= $pageComponent->toXML( $isSimplified );
		}

		$articleID = $this->mPageTitle->getArticleID();
		$pageName = $this->mPageTitle->getText();

		if ( !$isSimplified ) {
			return Xml::tags( $page_str, [ $id_str => $articleID, $title_str => $pageName ], $bodyXML );
		}

		// Xml::tags() inserts its contents verbatim, so the ID and title
		// go through Xml::element(), which escapes them - a title
		// containing "&" would otherwise produce invalid XML.
		$headerXML = Xml::element( $id_str, null, (string)$articleID )
			. Xml::element( $title_str, null, $pageName );
		return Xml::tags( $page_str, null, $headerXML . $bodyXML );
	}

}