From 9b25d404ce8d512aaffb90713fa126db8121b8e3 Mon Sep 17 00:00:00 2001 From: Daniel Kraus Date: Thu, 29 Dec 2016 07:53:58 +0100 Subject: [PATCH 1/2] Cache page titles. - Improvement: Increase performance of special page and maintenance script. --- includes/LinkTitles_Extension.php | 135 +++++++++++++++++------------- 1 file changed, 77 insertions(+), 58 deletions(-) diff --git a/includes/LinkTitles_Extension.php b/includes/LinkTitles_Extension.php index d983dd3..77de25e 100644 --- a/includes/LinkTitles_Extension.php +++ b/includes/LinkTitles_Extension.php @@ -30,6 +30,15 @@ function dump($var) { /// Central class of the extension. Sets up parser hooks. /// This class contains only static functions; do not instantiate. class Extension { + /// Caching variable for page titles that are fetched from the DB. + private static $pageTitles; + + /// Caching variable for the current namespace. + /// This is needed because the sort order of the page titles that + /// are cached in self::$pageTitles depends on the namespace of + /// the page currently being processed. + private static $currentNamespace; + /// A Title object for the page that is being parsed. private static $currentTitle; @@ -113,76 +122,23 @@ class Extension { private static function parseContent( $title, &$text ) { // Configuration variables need to be defined here as globals. - global $wgLinkTitlesPreferShortTitles; - global $wgLinkTitlesMinimumTitleLength; - global $wgLinkTitlesBlackList; global $wgLinkTitlesFirstOnly; global $wgLinkTitlesSmartMode; global $wgCapitalLinks; - global $wgLinkTitlesNamespaces; - ( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC'; ( $wgLinkTitlesFirstOnly ) ? $limit = 1 : $limit = -1; $limitReached = false; - self::$currentTitle = $title; + $currentNamespace = $title->getNamespace(); $newText = $text; - // Build a blacklist of pages that are not supposed to be link - // targets. This includes the current page. - $blackList = str_replace( ' ', '_', - '("' . implode( '","',$wgLinkTitlesBlackList ) . '","' . - addslashes( self::$currentTitle->getDbKey() ) . '")' ); - - $currentNamespace[] = $title->getNamespace(); - - // Build our weight list. Make sure current namespace is first element - $namespaces = array_diff($wgLinkTitlesNamespaces, $currentNamespace); - array_unshift($namespaces, $currentNamespace[0] ); - - // No need for sanitiy check. we are sure that we have at least one element in the array - $weightSelect = "CASE page_namespace "; - $currentWeight = 0; - foreach ($namespaces as &$namspacevalue) { - $currentWeight = $currentWeight + 100; - $weightSelect = $weightSelect . " WHEN " . $namspacevalue . " THEN " . $currentWeight . PHP_EOL; - } - $weightSelect = $weightSelect . " END "; - $namespacesClause = '(' . implode( ', ', $namespaces ) . ')'; - - // Build an SQL query and fetch all page titles ordered by length from - // shortest to longest. Only titles from 'normal' pages (namespace uid - // = 0) are returned. Since the db may be sqlite, we need a try..catch - // structure because sqlite does not support the CHAR_LENGTH function. - $dbr = wfGetDB( DB_SLAVE ); - try { - $res = $dbr->select( - 'page', - array( 'page_title', 'page_namespace' , "weight" => $weightSelect), - array( - 'page_namespace IN ' . $namespacesClause, - 'CHAR_LENGTH(page_title) >= ' . $wgLinkTitlesMinimumTitleLength, - 'page_title NOT IN ' . $blackList, - ), - __METHOD__, - array( 'ORDER BY' => 'weight ASC, CHAR_LENGTH(page_title) ' . $sort_order ) - ); - } catch (Exception $e) { - $res = $dbr->select( - 'page', - array( 'page_title', 'page_namespace' , "weight" => $weightSelect ), - array( - 'page_namespace IN ' . $namespacesClause, - 'LENGTH(page_title) >= ' . $wgLinkTitlesMinimumTitleLength, - 'page_title NOT IN ' . $blackList, - ), - __METHOD__, - array( 'ORDER BY' => 'weight ASC, LENGTH(page_title) ' . $sort_order ) - ); + if ( !isset( self::$pageTitles ) || ( $currentNamespace != self::$currentNamespace ) ) { + self::$currentNamespace = $currentNamespace; + self::$pageTitles = self::fetchPageTitles( $currentNamespace ); } // Iterate through the page titles - foreach( $res as $row ) { + foreach( self::$pageTitles as $row ) { self::newTarget( $row->page_namespace, $row->page_title ); // split the page content by [[...]] groups @@ -286,6 +242,69 @@ class Extension { return true; } + // Fetches the page titles from the database. + // @param $currentNamespace String holding the namespace of the page currently being processed. + private static function fetchPageTitles( $currentNamespace ) { + global $wgLinkTitlesPreferShortTitles; + global $wgLinkTitlesMinimumTitleLength; + global $wgLinkTitlesBlackList; + global $wgLinkTitlesNamespaces; + + ( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC'; + // Build a blacklist of pages that are not supposed to be link + // targets. This includes the current page. + $blackList = str_replace( ' ', '_', + '("' . implode( '","',$wgLinkTitlesBlackList ) . '","' . + addslashes( self::$currentTitle->getDbKey() ) . '")' ); + + // Build our weight list. Make sure current namespace is first element + $namespaces = array_diff( $wgLinkTitlesNamespaces, [ $currentNamespace ] ); + array_unshift( $namespaces, $currentNamespace ); + + // No need for sanitiy check. we are sure that we have at least one element in the array + $weightSelect = "CASE page_namespace "; + $currentWeight = 0; + foreach ($namespaces as &$namspacevalue) { + $currentWeight = $currentWeight + 100; + $weightSelect = $weightSelect . " WHEN " . $namspacevalue . " THEN " . $currentWeight . PHP_EOL; + } + $weightSelect = $weightSelect . " END "; + $namespacesClause = '(' . implode( ', ', $namespaces ) . ')'; + + // Build an SQL query and fetch all page titles ordered by length from + // shortest to longest. Only titles from 'normal' pages (namespace uid + // = 0) are returned. Since the db may be sqlite, we need a try..catch + // structure because sqlite does not support the CHAR_LENGTH function. + $dbr = wfGetDB( DB_SLAVE ); + try { + $res = $dbr->select( + 'page', + array( 'page_title', 'page_namespace' , "weight" => $weightSelect), + array( + 'page_namespace IN ' . $namespacesClause, + 'CHAR_LENGTH(page_title) >= ' . $wgLinkTitlesMinimumTitleLength, + 'page_title NOT IN ' . $blackList, + ), + __METHOD__, + array( 'ORDER BY' => 'weight ASC, CHAR_LENGTH(page_title) ' . $sort_order ) + ); + } catch (Exception $e) { + $res = $dbr->select( + 'page', + array( 'page_title', 'page_namespace' , "weight" => $weightSelect ), + array( + 'page_namespace IN ' . $namespacesClause, + 'LENGTH(page_title) >= ' . $wgLinkTitlesMinimumTitleLength, + 'page_title NOT IN ' . $blackList, + ), + __METHOD__, + array( 'ORDER BY' => 'weight ASC, LENGTH(page_title) ' . $sort_order ) + ); + } + + return $res; + } + // Build an anonymous callback function to be used in simple mode. private static function simpleModeCallback( array $matches ) { if ( self::checkTargetPage() ) { From 610c221211ebc3134e234a51901387f67003f820 Mon Sep 17 00:00:00 2001 From: Daniel Kraus Date: Mon, 2 Jan 2017 13:42:34 +0100 Subject: [PATCH 2/2] Fix preventing linking to self. --- includes/LinkTitles_Extension.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/includes/LinkTitles_Extension.php b/includes/LinkTitles_Extension.php index 77de25e..b95c77b 100644 --- a/includes/LinkTitles_Extension.php +++ b/includes/LinkTitles_Extension.php @@ -141,6 +141,9 @@ class Extension { foreach( self::$pageTitles as $row ) { self::newTarget( $row->page_namespace, $row->page_title ); + // Don't link current page + if ( self::$targetTitle->equals( self::$currentTitle ) ) { continue; } + // split the page content by [[...]] groups // credits to inhan @ StackOverflow for suggesting preg_split // see http://stackoverflow.com/questions/10672286 @@ -253,9 +256,7 @@ class Extension { ( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC'; // Build a blacklist of pages that are not supposed to be link // targets. This includes the current page. - $blackList = str_replace( ' ', '_', - '("' . implode( '","',$wgLinkTitlesBlackList ) . '","' . - addslashes( self::$currentTitle->getDbKey() ) . '")' ); + $blackList = str_replace( ' ', '_', '("' . implode( '","',$wgLinkTitlesBlackList ) . '")' ); // Build our weight list. Make sure current namespace is first element $namespaces = array_diff( $wgLinkTitlesNamespaces, [ $currentNamespace ] );