Use preg_replace_callback throughout; lazy checks.

The checks for whether a page is a potential link target or not (depending
on the absence of the __NOAUTOLINKTARGET__ magic word and if it is not a
redirect to the current page) have now been moved into the callback
functions, so that they are only performed if a page really is a candidate
for linking (i.e., its title occurs on the currently edited page).

The change resulted in a ~10-fold increase in speed.
This commit is contained in:
Daniel Kraus
2014-06-10 16:22:28 +02:00
parent 4ca1225fd0
commit d7571c4922

View File

@ -29,6 +29,17 @@
/// Central class of the extension. Sets up parser hooks. /// Central class of the extension. Sets up parser hooks.
/// This class contains only static functions; do not instantiate. /// This class contains only static functions; do not instantiate.
class LinkTitles { class LinkTitles {
/// A Title object for the page that is being parsed.
private static $mCurrentTitle;
/// A Title object for the target page currently being examined.
private static $mTargetTitle;
/// The content object for the currently processed target page.
/// This variable is necessary to be able to prevent loading the target
/// content twice.
private static $mTargetContent;
/// Setup function, hooks the extension's functions to MediaWiki events. /// Setup function, hooks the extension's functions to MediaWiki events.
public static function setup() { public static function setup() {
global $wgLinkTitlesParseOnEdit; global $wgLinkTitlesParseOnEdit;
@ -74,6 +85,7 @@
/// @param $content Content object that holds the article content /// @param $content Content object that holds the article content
/// @returns true /// @returns true
static function parseContent( &$article, &$content ) { static function parseContent( &$article, &$content ) {
wfProfileIn( __METHOD__ );
// If the page contains the magic word '__NOAUTOLINKS__', do not parse // If the page contains the magic word '__NOAUTOLINKS__', do not parse
// the content. // the content.
@ -93,17 +105,9 @@
global $wgLinkTitlesWordEndOnly; global $wgLinkTitlesWordEndOnly;
global $wgLinkTitlesSmartMode; global $wgLinkTitlesSmartMode;
global $wgCapitalLinks; global $wgCapitalLinks;
global $wgLinkTitlesEnableNoTargetMagicWord;
global $wgLinkTitlesCheckRedirect;
( $wgLinkTitlesWordStartOnly ) ? $wordStartDelim = '\b' : $wordStartDelim = ''; ( $wgLinkTitlesWordStartOnly ) ? $wordStartDelim = '\b' : $wordStartDelim = '';
( $wgLinkTitlesWordEndOnly ) ? $wordEndDelim = '\b' : $wordEndDelim = ''; ( $wgLinkTitlesWordEndOnly ) ? $wordEndDelim = '\b' : $wordEndDelim = '';
// ( $wgLinkTitlesIgnoreCase ) ? $regexModifier = 'i' : $regexModifier = '';
// To prevent adding self-references, we now
// extract the current page's title.
$myTitle = $article->getTitle();
$myTitleText = $myTitle->GetText();
( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC'; ( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC';
( $wgLinkTitlesFirstOnly ) ? $limit = 1 : $limit = -1; ( $wgLinkTitlesFirstOnly ) ? $limit = 1 : $limit = -1;
@ -115,6 +119,10 @@
$templatesDelimiter = '{{[^|]+?}}|{{.+\||'; $templatesDelimiter = '{{[^|]+?}}|{{.+\||';
}; };
LinkTitles::$mCurrentTitle = $article->getTitle();
$text = $content->getContentHandler()->serializeContent($content);
$newText = $text;
// Build a regular expression that will capture existing wiki links ("[[...]]"), // Build a regular expression that will capture existing wiki links ("[[...]]"),
// wiki headings ("= ... =", "== ... ==" etc.), // wiki headings ("= ... =", "== ... ==" etc.),
// urls ("http://example.com", "[http://example.com]", "[http://example.com Description]", // urls ("http://example.com", "[http://example.com]", "[http://example.com Description]",
@ -143,8 +151,7 @@
// targets. This includes the current page. // targets. This includes the current page.
$black_list = str_replace( '_', ' ', $black_list = str_replace( '_', ' ',
'("' . implode( '", "',$wgLinkTitlesBlackList ) . '("' . implode( '", "',$wgLinkTitlesBlackList ) .
$myTitle->getDbKey() . '")' ); LinkTitles::$mCurrentTitle->getDbKey() . '")' );
// Build an SQL query and fetch all page titles ordered by length from // Build an SQL query and fetch all page titles ordered by length from
// shortest to longest. Only titles from 'normal' pages (namespace uid // shortest to longest. Only titles from 'normal' pages (namespace uid
@ -177,59 +184,30 @@
); );
} }
$text = $content->getContentHandler()->serializeContent($content); // Build an anonymous callback function to be used in simple mode.
$simpleModeCallback = function( $matches ) {
if ( LinkTitles::checkTargetPage() ) {
return '[[' . $matches[0] . ']]';
}
else
{
return $matches[0];
}
};
// Iterate through the page titles // Iterate through the page titles
wfProfileIn('LinkTitles::parseContent-row_iterator');
foreach( $res as $row ) { foreach( $res as $row ) {
// Obtain an instance of a Title class for the current database row. LinkTitles::newTarget( $row->page_title );
$targetTitle = Title::makeTitle(NS_MAIN, $row->page_title);
if ( $wgLinkTitlesCheckRedirect || $wgLinkTitlesEnableNoTargetMagicWord ) {
// Obtain a page object for the current title, so we can check for
// the presence of the __NOAUTOLINKTARGET__ magic keyword.
$targetPageContent = WikiPage::factory($targetTitle)->getContent();
// To prevent linking to pages that redirect to the current page,
// obtain the title that the target page redirects to. Will be null
// if there is no redirect.
if ( $wgLinkTitlesCheckRedirect ) {
$redirectTitle = $targetPageContent->getUltimateRedirectTarget();
$redirectCheck = !( $redirectTitle && $redirectTitle->equals($myTitle) );
}
else
{
$redirectCheck = true;
};
if ( $wgLinkTitlesEnableNoTargetMagicWord ) {
$magicWordCheck = ! $targetPageContent->matchMagicWord(
MagicWord::get('MAG_LINKTITLES_NOTARGET') );
}
else
{
$magicWordCheck = true;
};
}
else
{
$redirectCheck = true;
$magicWordCheck = true;
}
// Proceed only if the currently examined page does not redirect to
// our page and does not contain the no-target magic word.
// If the corresponding configuration variables are set to false,
// both 'check' variables below will be set to true by the code
// above.
if ( $redirectCheck && $magicWordCheck ) {
// split the page content by [[...]] groups // split the page content by [[...]] groups
// credits to inhan @ StackOverflow for suggesting preg_split // credits to inhan @ StackOverflow for suggesting preg_split
// see http://stackoverflow.com/questions/10672286 // see http://stackoverflow.com/questions/10672286
$arr = preg_split( $delimiter, $text, -1, PREG_SPLIT_DELIM_CAPTURE ); $arr = preg_split( $delimiter, $newText, -1, PREG_SPLIT_DELIM_CAPTURE );
// Escape certain special characters in the page title to prevent // Escape certain special characters in the page title to prevent
// regexp compilation errors // regexp compilation errors
$targetTitleText = $targetTitle->getText(); $targetTitleText = LinkTitles::$mTargetTitle->getText();
$quotedTitle = preg_quote($targetTitleText, '/'); $quotedTitle = preg_quote($targetTitleText, '/');
// Depending on the global configuration setting $wgCapitalLinks, // Depending on the global configuration setting $wgCapitalLinks,
@ -245,9 +223,9 @@
for ( $i = 0; $i < count( $arr ); $i+=2 ) { for ( $i = 0; $i < count( $arr ); $i+=2 ) {
// even indexes will point to text that is not enclosed by brackets // even indexes will point to text that is not enclosed by brackets
$arr[$i] = preg_replace( '/(?<![\:\.\@\/\?\&])' . $arr[$i] = preg_replace_callback( '/(?<![\:\.\@\/\?\&])' .
$wordStartDelim . $searchTerm . $wordEndDelim . '/', $wordStartDelim . $searchTerm . $wordEndDelim . '/',
'[[$1]]', $arr[$i], $limit, $count ); $simpleModeCallback, $arr[$i], $limit, $count );
if (( $limit >= 0 ) && ( $count > 0 )) { if (( $limit >= 0 ) && ( $count > 0 )) {
break; break;
}; };
@ -273,6 +251,7 @@
// we need to ignore the first letter of the page titles, as // we need to ignore the first letter of the page titles, as
// it does not matter for linking. // it does not matter for linking.
$callback = function ($matches) use ($targetTitleText) { $callback = function ($matches) use ($targetTitleText) {
if ( LinkTitles::checkTargetPage() ) {
if ( strcmp(substr($targetTitleText, 1), substr($matches[0], 1)) == 0 ) { if ( strcmp(substr($targetTitleText, 1), substr($matches[0], 1)) == 0 ) {
// Case-sensitive match: no need to bulid piped link. // Case-sensitive match: no need to bulid piped link.
return '[[' . $matches[0] . ']]'; return '[[' . $matches[0] . ']]';
@ -280,6 +259,11 @@
// Case-insensitive match: build piped link. // Case-insensitive match: build piped link.
return '[[' . $targetTitleText . '|' . $matches[0] . ']]'; return '[[' . $targetTitleText . '|' . $matches[0] . ']]';
} }
}
else
{
return $matches[0];
}
}; };
} }
else else
@ -287,6 +271,7 @@
// If $wgCapitalLinks is false, we can use the simple variant // If $wgCapitalLinks is false, we can use the simple variant
// of the callback function. // of the callback function.
$callback = function ($matches) use ($targetTitleText) { $callback = function ($matches) use ($targetTitleText) {
if ( LinkTitles::checkTargetPage() ) {
if ( strcmp($targetTitleText, $matches[0]) == 0 ) { if ( strcmp($targetTitleText, $matches[0]) == 0 ) {
// Case-sensitive match: no need to bulid piped link. // Case-sensitive match: no need to bulid piped link.
return '[[' . $matches[0] . ']]'; return '[[' . $matches[0] . ']]';
@ -294,6 +279,11 @@
// Case-insensitive match: build piped link. // Case-insensitive match: build piped link.
return '[[' . $targetTitleText . '|' . $matches[0] . ']]'; return '[[' . $targetTitleText . '|' . $matches[0] . ']]';
} }
}
else
{
return $matches[0];
}
}; };
} }
@ -309,12 +299,13 @@
}; };
}; };
$newText = implode( '', $arr ); $newText = implode( '', $arr );
} // $wgLinkTitlesSmartMode
}; // foreach $res as $row
wfProfileOut('LinkTitles::parseContent-row_iterator');
if ( $newText != $text ) { if ( $newText != $text ) {
$content = $content->getContentHandler()->unserializeContent( $newText ); $content = $content->getContentHandler()->unserializeContent( $newText );
} }
} // $wgLinkTitlesSmartMode wfProfileOut( __METHOD__ );
}
}; // foreach $res as $row
return true; return true;
} }
@ -354,6 +345,63 @@
$mwa->matchAndRemove( $text ); $mwa->matchAndRemove( $text );
return true; return true;
} }
/// Sets the member variables for a new target page.
/// Builds a Title object in the main namespace from the given title string
/// and discards any previously cached target content, so that the content
/// of the new target is loaded afresh when it is first requested.
/// Declared static because the class is never instantiated and this
/// function is invoked as LinkTitles::newTarget().
/// @param $titleString Page title (as stored in the database) of the
///                     target page that is examined next.
private static function newTarget( $titleString ) {
	// @todo Make this wiki namespace aware.
	LinkTitles::$mTargetTitle = Title::makeTitle( NS_MAIN, $titleString );
	// Invalidate the cached content of the previous target page.
	LinkTitles::$mTargetContent = null;
}
/// Returns the content of the current target page.
/// This function serves to be used in preg_replace_callback callback
/// functions, in order to load the target page content from the
/// database only when needed. The content is cached in
/// LinkTitles::$mTargetContent, so it is fetched at most once per target
/// page (unless the fetch yields null, in which case it is retried).
/// Declared static because it is invoked as LinkTitles::getTargetContent().
/// @note It is absolutely necessary that the newTarget()
/// function is called for every new page.
/// @returns The value returned by WikiPage::getContent() for the current
///          target title.
private static function getTargetContent() {
	// The cache lives in the static class property; a bare local variable
	// of the same name would always be unset and defeat the caching.
	if ( LinkTitles::$mTargetContent === null ) {
		LinkTitles::$mTargetContent = WikiPage::factory(
			LinkTitles::$mTargetTitle )->getContent();
	}
	return LinkTitles::$mTargetContent;
}
/// Examines the current target page. Returns true if it may be linked;
/// false if not. This depends on the settings
/// $wgLinkTitlesCheckRedirect and $wgLinkTitlesEnableNoTargetMagicWord
/// and on whether the target page redirects to the page that is currently
/// being parsed or contains the __NOAUTOLINKTARGET__ magic word.
/// The target content is only requested if at least one of the two
/// settings is enabled. Declared static because it is invoked as
/// LinkTitles::checkTargetPage() from the replacement callbacks.
/// @returns boolean
private static function checkTargetPage() {
	wfProfileIn( __METHOD__ );
	global $wgLinkTitlesEnableNoTargetMagicWord;
	global $wgLinkTitlesCheckRedirect;

	$mayLink = true;

	if ( $wgLinkTitlesCheckRedirect || $wgLinkTitlesEnableNoTargetMagicWord ) {
		// Fetch the content once and reuse it for both checks.
		$targetContent = LinkTitles::getTargetContent();

		// Without content there is nothing to inspect: the page can neither
		// redirect anywhere nor contain the magic word, so it stays linkable.
		if ( $targetContent !== null ) {
			// If checking for redirects is enabled and the target page does
			// indeed redirect to the current page, the title must not be linked.
			if ( $wgLinkTitlesCheckRedirect ) {
				$redirectTitle = $targetContent->getUltimateRedirectTarget();
				if ( $redirectTitle && $redirectTitle->equals( LinkTitles::$mCurrentTitle ) ) {
					$mayLink = false;
				}
			}

			// If the magic word __NOAUTOLINKTARGET__ is enabled and the target
			// page does indeed contain this magic word, the title must not be
			// linked. Skipped if the redirect check already ruled the page out.
			if ( $mayLink && $wgLinkTitlesEnableNoTargetMagicWord ) {
				if ( $targetContent->matchMagicWord(
						MagicWord::get( 'MAG_LINKTITLES_NOTARGET' ) ) ) {
					$mayLink = false;
				}
			}
		}
	}

	// Single exit point keeps wfProfileIn/wfProfileOut balanced.
	wfProfileOut( __METHOD__ );
	return $mayLink;
}
} }
// vim: ts=2:sw=2:noet:comments^=\:/// // vim: ts=2:sw=2:noet:comments^=\:///