From 35b174771eca494a7181d543afc37062043754ae Mon Sep 17 00:00:00 2001 From: Daniel Kraus Date: Sun, 27 Aug 2017 22:18:22 +0200 Subject: [PATCH] Refactor Linker class, add Target class. --- extension.json | 1 + includes/Config.php | 20 ++++ includes/Delimiters.php | 62 +++++----- includes/Linker.php | 245 ++++++++++++---------------------------- includes/Target.php | 200 ++++++++++++++++++++++++++++++++ 5 files changed, 328 insertions(+), 200 deletions(-) create mode 100644 includes/Target.php diff --git a/extension.json b/extension.json index 32f2a4d..3093b7f 100644 --- a/extension.json +++ b/extension.json @@ -35,6 +35,7 @@ "AutoloadClasses": { "LinkTitles\\Extension": "includes/Extension.php", "LinkTitles\\Linker": "includes/Linker.php", + "LinkTitles\\Target": "includes/Target.php", "LinkTitles\\Targets": "includes/Targets.php", "LinkTitles\\Delimiters": "includes/Delimiters.php", "LinkTitles\\Config": "includes/Config.php", diff --git a/includes/Config.php b/includes/Config.php index e975c66..9200060 100644 --- a/includes/Config.php +++ b/includes/Config.php @@ -133,6 +133,22 @@ class Config { */ public $parseHeadings; + /** + * Whether to check if a potential target page links back to the source page. + * Set this to true to avoid indirect linkbacks. + * + * @var bool $checkRedirect + */ + public $checkRedirect; + + /** + * Whether to enable the __NOAUTOLINKTARGET__ magic word which prevents + * a potential target page from being linked to. 
+ * + * @var bool $enableNoTargetMagicWord + */ + public $enableNoTargetMagicWord; + public $enableConsoleOutput; public $enableDebugConsoleOutput; @@ -156,6 +172,8 @@ class Config { global $wgLinkTitlesWordEndOnly; global $wgLinkTitlesSkipTemplates; global $wgLinkTitlesParseHeadings; + global $wgLinkTitlesEnableNoTargetMagicWord; + global $wgLinkTitlesCheckRedirect; $this->parseOnEdit = $wgLinkTitlesParseOnEdit; $this->parseOnRender = $wgLinkTitlesParseOnRender; $this->preferShortTitles = $wgLinkTitlesPreferShortTitles; @@ -169,6 +187,8 @@ class Config { $this->wordEndOnly = $wgLinkTitlesWordEndOnly; $this->skipTemplates = $wgLinkTitlesSkipTemplates; $this->parseHeadings = $wgLinkTitlesParseHeadings; + $this->enableNoTargetMagicWord = $wgLinkTitlesEnableNoTargetMagicWord;; + $this->checkRedirect = $wgLinkTitlesCheckRedirect;; $this->enableConsoleOutput = false; $this->enableDebugConsoleOutput = false; } diff --git a/includes/Delimiters.php b/includes/Delimiters.php index d6b513d..0338325 100644 --- a/includes/Delimiters.php +++ b/includes/Delimiters.php @@ -27,15 +27,36 @@ namespace LinkTitles; * Caches a regular expression that delimits text to be parsed. */ class Delimiters { + /** + * The splitting expression that separates text to be parsed from text that + * must not be parsed. + * @var String $splitter + */ + public $splitter; + + /** + * The LinkTitles configuration for this Delimiters instance. + * @var Config $config + */ + public $config; + private static $instance; /** - * Singleton factory. + * Gets the Delimiters singleton; may build one with the given config or the + * default config if none is given. * - * @param Config $config LinkTitles configuration. + * If the instance was already created, it does not matter what Config this + * method is called with. To re-create an instance with a different Config, + * call Delimiters::invalidate() first. + * + * @param Config|null $config LinkTitles configuration. 
*/ - public static function default( Config $config ) { + public static function default( Config &$config = null ) { if ( self::$instance === null ) { + if ( $config === null ) { + $config = new Config(); + } self::$instance = new Delimiters( $config ); } return self::$instance; @@ -56,27 +77,15 @@ class Delimiters { } /** - * The splitting expression that separates text to be parsed from text that - * must not be parsed. - * @var String $splitter + * Splits a text into sections that may be linked and sections that may not + * be linked (e.g., because they already are a link, or a template, etc.). + * + * @param String &$text Text to split. + * @return Array of strings where even indexes point to linkable sections. */ - public $splitter; - - /** - * Regex that matches the start of a word; this expression depends on the - * setting of LinkTitles\Config->wordStartOnly; - * @var String $wordStart - */ - public $wordStart; - - /** - * Regex that matches the end of a word; this expression depends on the - * setting of LinkTitles\Config->wordEndOnly; - * @var String $wordEnd - */ - public $wordEnd; - - private $config; + public function split( &$text ) { + return preg_split( $this->splitter, $text, -1, PREG_SPLIT_DELIM_CAPTURE ); + } /* * Builds the delimiter that is used in a regexp to separate @@ -84,13 +93,6 @@ class Delimiters { * parsed (e.g. inside existing links etc.) */ private function buildDelimiters() { - // Use unicode character properties rather than \b escape sequences - // to detect whole words containing non-ASCII characters as well. - // Note that this requires a PCRE library that was compiled with - // --enable-unicode-properties - ( $this->config->wordStartOnly ) ? $this->wordStart = '(?wordStart = ''; - ( $this->config->wordEndOnly ) ? 
$this->wordEnd = '(?!\pL)' : $this->wordEnd = ''; - if ( $this->config->skipTemplates ) { // Use recursive regex to balance curly braces; diff --git a/includes/Linker.php b/includes/Linker.php index 8ebafa2..9e4806a 100644 --- a/includes/Linker.php +++ b/includes/Linker.php @@ -27,24 +27,6 @@ namespace LinkTitles; * Performs the actual linking of content to existing pages. */ class Linker { - /// A Title object for the page that is being parsed. - private $currentTitle; - - /// A Title object for the target page currently being examined. - private $targetTitle; - - // The TitleValue object of the target page - private $targetTitleValue; - - /// The content object for the currently processed target page. - /// This variable is necessary to be able to prevent loading the target - /// content twice. - private $targetContent; - - /// Holds the page title of the currently processed target page - /// as a string. - private $targetTitleText; - /** * LinkTitles configuration. * @@ -52,6 +34,17 @@ class Linker { */ public $config; + /** + * The string representation of the title object for the potential target page + * that is currently being processed. + * + * This is an instance variable (rather than a local method variable) so it + * can be accessed in the preg_replace_callback callbacks. + * + * @var String $targetTitleString + */ + private $targetTitleText; + /** * Constructs a new instance of the Linker class. * @@ -61,58 +54,50 @@ class Linker { $this->config = $config; } - /* + /** * Core function of the extension, performs the actual parsing of the content. * - * @param Parser $parser Parser instance for the current page + * This method receives a Title object and the string representation of the + * source page. It does not work on a WikiPage object directly because the + * callbacks in the Extension class do not always get a WikiPage object in the + * first place. + * + * @param \Title &$title Title object for the current page. 
* @param String $text String that holds the article content - * @returns String String with links to target pages + * @return String with links to target pages */ public function linkContent( \Title &$title, &$text ) { ( $this->config->firstOnly ) ? $limit = 1 : $limit = -1; $limitReached = false; - $this->currentTitle = $title; $newText = $text; $delimiters = Delimiters::default( $this->config ); $targets = Targets::default( $title, $this->config ); - // Iterate through the page titles + // Iterate through the target page titles foreach( $targets->queryResult as $row ) { - $this->newTarget( $row->page_namespace, $row->page_title ); + $target = new Target( $row->page_namespace, $row->page_title, $this->config ); - // Don't link current page - if ( $this->targetTitle->equals( $this->currentTitle ) ) { continue; } - - // split the page content by [[...]] groups - // credits to inhan @ StackOverflow for suggesting preg_split - // see http://stackoverflow.com/questions/10672286 - $arr = preg_split( $delimiters->splitter, $newText, -1, PREG_SPLIT_DELIM_CAPTURE ); - - // Escape certain special characters in the page title to prevent - // regexp compilation errors - $this->targetTitleText = $this->targetTitle->getText(); - $quotedTitle = preg_quote( $this->targetTitleText, '/' ); - - $this->ltDebugLog( 'TargetTitle='. $this->targetTitleText, 'private' ); - $this->ltDebugLog( 'TargetTitleQuoted='. $quotedTitle, 'private' ); - - // Depending on the global configuration setting $wgCapitalLinks, - // the title has to be searched for either in a strictly case-sensitive - // way, or in a 'fuzzy' way where the first letter of the title may - // be either case. - if ( $this->config->capitalLinks && ( $quotedTitle[0] != '\\' )) { - $searchTerm = '((?i)' . $quotedTitle[0] . '(?-i)' . - substr($quotedTitle, 1) . ')'; - } else { - $searchTerm = '(' . $quotedTitle . 
')'; + // Don't link current page and don't link if the target page redirects + // to the current page or has the __NOAUTOLINKTARGET__ magic word + // (as required by the actual LinkTitles configuration). + if ( $target->isSameTitle( $title ) || !$target->mayLinkTo( $title ) ) { + continue; } - $regex = '/(?wordStart . $searchTerm . $delimiters->wordEnd . '/S'; - for ( $i = 0; $i < count( $arr ); $i+=2 ) { - // even indexes will point to text that is not enclosed by brackets - $arr[$i] = preg_replace_callback( $regex, + // Split the page content by non-linkable sections. + // Credits to inhan @ StackOverflow for suggesting preg_split. + // See http://stackoverflow.com/questions/10672286 + $arr = $delimiters->split( $newText ); + $count = 0; + + // Cache the target title text for the regex callbacks + $this->targetTitleText = $target->getTitleText(); + + // Even indexes will point to sections of the text that may be linked + for ( $i = 0; $i < count( $arr ); $i += 2 ) { + $arr[$i] = preg_replace_callback( $target->getCaseSensitiveRegex(), array( $this, 'simpleModeCallback'), $arr[$i], $limit, $count ); if ( $this->config->firstOnly && ( $count > 0 ) ) { @@ -126,13 +111,14 @@ class Linker { // pass on the page and add links with aliases where the case does // not match. if ( $this->config->smartMode && !$limitReached ) { - $arr = preg_split( $delimiters->splitter, $newText, -1, PREG_SPLIT_DELIM_CAPTURE ); + if ( $count > 0 ) { + // Split the text again because it was changed in the first pass. + $arr = $delimiters->split( $newText ); + } for ( $i = 0; $i < count( $arr ); $i+=2 ) { // even indexes will point to text that is not enclosed by brackets - $arr[$i] = preg_replace_callback( '/(?wordStart . '(' . $quotedTitle . ')' . - $delimiters->wordEnd . 
'/iS', + $arr[$i] = preg_replace_callback( $target->getCaseInsensitiveRegex(), array( $this, 'smartModeCallback'), $arr[$i], $limit, $count ); if ( $this->config->firstOnly && ( $count > 0 )) { @@ -142,138 +128,57 @@ class Linker { $newText = implode( '', $arr ); } // $wgLinkTitlesSmartMode }; // foreach $res as $row + return $newText; } - // Build an anonymous callback function to be used in simple mode. + /** + * Callback for preg_replace_callback in simple mode. + * + * @param array $matches Matches provided by preg_replace_callback + * @return string Target page title with or without link markup + */ private function simpleModeCallback( array $matches ) { - if ( $this->checkTargetPage() ) { - $this->ltLog( "Linking '$matches[0]' to '" . $this->targetTitle . "'" ); - return '[[' . $matches[0] . ']]'; - } - else - { - return $matches[0]; - } + return '[[' . $matches[0] . ']]'; } - // Callback function for use with preg_replace_callback. - // This essentially performs a case-sensitive comparison of the - // current page title and the occurrence found on the page; if - // the cases do not match, it builds an aliased (piped) link. - // If $wgCapitalLinks is set to true, the case of the first - // letter is ignored by MediaWiki and we don't need to build a - // piped link if only the case of the first letter is different. + /** + * Callback function for use with preg_replace_callback. + * This essentially performs a case-sensitive comparison of the + * current page title and the occurrence found on the page; if + * the cases do not match, it builds an aliased (piped) link. + * If $wgCapitalLinks is set to true, the case of the first + * letter is ignored by MediaWiki and we don't need to build a + * piped link if only the case of the first letter is different. 
+ * + * @param array $matches Matches provided by preg_replace_callback + * @return string Target page title with or without link markup + */ private function smartModeCallback( array $matches ) { - if ( $this->config->capitalLinks ) { // With $wgCapitalLinks set to true we have a slightly more // complicated version of the callback than if it were false; // we need to ignore the first letter of the page titles, as // it does not matter for linking. - if ( $this->checkTargetPage() ) { - $this->ltLog( "Linking (smart) '$matches[0]' to '" . $this->targetTitle . "'" ); - if ( strcmp(substr($this->targetTitleText, 1), substr($matches[0], 1)) == 0 ) { - // Case-sensitive match: no need to bulid piped link. - return '[[' . $matches[0] . ']]'; - } else { - // Case-insensitive match: build piped link. - return '[[' . $this->targetTitleText . '|' . $matches[0] . ']]'; - } - } - else - { - return $matches[0]; + if ( strcmp( substr( $this->targetTitleText, 1 ), substr( $matches[ 0 ], 1) ) == 0 ) { + // Case-sensitive match: no need to bulid piped link. + return '[[' . $matches[ 0 ] . ']]'; + } else { + // Case-insensitive match: build piped link. + return '[[' . $this->targetTitleText . '|' . $matches[ 0 ] . ']]'; } } else { // If $wgCapitalLinks is false, we can use the simple variant // of the callback function. - if ( $this->checkTargetPage() ) { - $this->ltLog( "Linking (smart) '$matches[0]' to '" . $this->targetTitle . "'" ); - if ( strcmp($this->targetTitleText, $matches[0]) == 0 ) { - // Case-sensitive match: no need to bulid piped link. - return '[[' . $matches[0] . ']]'; - } else { - // Case-insensitive match: build piped link. - return '[[' . $this->targetTitleText . '|' . $matches[0] . ']]'; - } - } - else - { - return $matches[0]; + if ( strcmp( $this->targetTitleText, $matches[ 0 ] ) == 0 ) { + // Case-sensitive match: no need to bulid piped link. + return '[[' . $matches[ 0 ] . ']]'; + } else { + // Case-insensitive match: build piped link. + return '[[' . 
$this->targetTitleText . '|' . $matches[ 0 ] . ']]'; } } } - - /// Sets member variables for the current target page. - private function newTarget( $ns, $title ) { - $this->targetTitle = \Title::makeTitleSafe( $ns, $title ); - $this->ltDebugLog( 'newtarget='. $this->targetTitle->getText(), "private" ); - $this->targetTitleValue = $this->targetTitle->getTitleValue(); - $this->ltDebugLog( 'altTarget='. $this->targetTitleValue->getText(), "private" ); - $this->targetContent = null; - } - - /// Returns the content of the current target page. - /// This function serves to be used in preg_replace_callback callback - /// functions, in order to load the target page content from the - /// database only when needed. - /// @note It is absolutely necessary that the newTarget() - /// function is called for every new page. - private function getTargetContent() { - if ( ! isset( $targetContent ) ) { - $this->targetContent = \WikiPage::factory( $this->targetTitle )->getContent(); - }; - return $this->targetContent; - } - - /// Examines the current target page. Returns true if it may be linked; - /// false if not. This depends on the settings - /// $wgLinkTitlesCheckRedirect and $wgLinkTitlesEnableNoTargetMagicWord - /// and whether the target page is a redirect or contains the - /// __NOAUTOLINKTARGET__ magic word. - /// @returns boolean - private function checkTargetPage() { - global $wgLinkTitlesEnableNoTargetMagicWord; - global $wgLinkTitlesCheckRedirect; - - // If checking for redirects is enabled and the target page does - // indeed redirect to the current page, return the page title as-is - // (unlinked). - if ( $wgLinkTitlesCheckRedirect ) { - $redirectTitle = $this->getTargetContent()->getUltimateRedirectTarget(); - if ( $redirectTitle && $redirectTitle->equals($this->currentTitle) ) { - return false; - } - }; - - // If the magic word __NOAUTOLINKTARGET__ is enabled and the target - // page does indeed contain this magic word, return the page title - // as-is (unlinked). 
- if ( $wgLinkTitlesEnableNoTargetMagicWord ) { - if ( $this->getTargetContent()->matchMagicWord( - \MagicWord::get('MAG_LINKTITLES_NOTARGET') ) ) { - return false; - } - }; - return true; - } - - /// Local Debugging output function which can send output to console as well - public function ltDebugLog($text) { - if ( $this->config->enableDebugConsoleOutput ) { - print $text . "\n"; - } - wfDebugLog( 'LinkTitles', $text , 'private' ); - } - - /// Local Logging output function which can send output to console as well - public function ltLog($text) { - if ( $this->config->enableConsoleOutput) { - print $text . "\n"; - } - wfDebugLog( 'LinkTitles', $text , 'private' ); - } } // vim: ts=2:sw=2:noet:comments^=\:/// diff --git a/includes/Target.php b/includes/Target.php new file mode 100644 index 0000000..f2457d8 --- /dev/null +++ b/includes/Target.php @@ -0,0 +1,200 @@ + ('bovender') + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + * + * @author Daniel Kraus + */ +namespace LinkTitles; + +/** + * Represents a page that is a potential link target. + */ +class Target { + /** + * \TitleValue object for the current target page title. 
+ * @var \TitleValue $titleValue; + */ + public $titleValue; + + /** + * Regex that matches the start of a word; this expression depends on the + * setting of LinkTitles\Config->wordStartOnly; + * @var String $wordStart + */ + public $wordStart; + + /** + * Regex that matches the end of a word; this expression depends on the + * setting of LinkTitles\Config->wordEndOnly; + * @var String $wordEnd + */ + public $wordEnd; + + /** + * A Title object for the target page currently being examined. + * @var \Title $title + */ + private $title; + + /** + * Caches the target page content as a \Content object. + * + * @var \Content $content + */ + private $content; + + /** + * LinkTitles configuration. + * @var Config $config + */ + private $config; + + /** + * Constructs a new Target object + * + * The parameters may be taken from database rows, for example. + * + * @param Int $nameSpace Name space of the target page + * @param String &$title Title of the target page + */ + public function __construct( $nameSpace, &$title, Config &$config ) { + $this->title = \Title::makeTitleSafe( $nameSpace, $title ); + $this->titleValue = $this->title->getTitleValue(); + $this->config = $config; + + // Use unicode character properties rather than \b escape sequences + // to detect whole words containing non-ASCII characters as well. + // Note that this requires a PCRE library that was compiled with + // --enable-unicode-properties + ( $config->wordStartOnly ) ? $this->wordStart = '(?<!\pL)' : $this->wordStart = ''; + ( $config->wordEndOnly ) ? $this->wordEnd = '(?!\pL)' : $this->wordEnd = ''; + } + + /** + * Gets the string representation of the target title. + * @return String title text + */ + public function getTitleText() { + return $this->title->getText(); + } + + /** + * Gets the title string with certain characters escaped that may interfere + * with regular expressions. 
+ * @return String representation of the title, regex-safe + */ + public function getRegexSafeTitle() { + return preg_quote( $this->title->getText(), '/' ); + } + + /** + * Builds a regular expression of the title + * @return String regular expression for this title. + */ + public function getCaseSensitiveRegex() { + $regexSafeTitle = $this->getRegexSafeTitle(); + + // Depending on the $config->capitalLinks setting, + // the title has to be searched for either in a strictly case-sensitive + // way, or in a 'fuzzy' way where the first letter of the title may + // be either case. + // + if ( $this->config->capitalLinks && ( $regexSafeTitle[0] != '\\' )) { + $searchTerm = '((?i)' . $regexSafeTitle[0] . '(?-i)' . substr($regexSafeTitle, 1) . ')'; + } else { + $searchTerm = '(' . $regexSafeTitle . ')'; + } + + return $this->buildRegex( $searchTerm ); + } + + /** + * Builds a regular expression pattern for the title in a case-insensitive + * way. + * @return String case-insensitive regular expression pattern for the title + */ + public function getCaseInsensitiveRegex() { + return $this->buildRegex( $this->getRegexSafeTitle() ) . 'i'; + } + + /** + * Builds the basic regex that is used to match target page titles in a source + * text. + * @param String $searchTerm Target page title (special characters must be quoted) + * @return String regular expression pattern + */ + private function buildRegex( $searchTerm ) { + return '/(?<![\:\.\@\/\?\&])' . $this->wordStart . $searchTerm . $this->wordEnd . '/S'; + } + + /** + * Returns the \Content of the target page. + * + * The value is cached. + * @return \Content Content of the Target page. + */ + public function getContent() { + if ( $this->content === null ) { + $this->content = \WikiPage::factory( $this->title )->getContent(); + }; + return $this->content; + } + + /** + * Examines the current target page. Returns true if it may be linked; + * false if not. 
This depends on two settings: + * $wgLinkTitlesCheckRedirect and $wgLinkTitlesEnableNoTargetMagicWord + * and whether the target page is a redirect or contains the + * __NOAUTOLINKTARGET__ magic word. + * + * @param \Title $fromTitle Title of the source page that would link to this target + * + * @return boolean + */ + public function mayLinkTo( \Title $fromTitle ) { + // If checking for redirects is enabled and the target page does + // indeed redirect to the current page, return the page title as-is + // (unlinked). + if ( $this->config->checkRedirect ) { + $redirectTitle = $this->getContent()->getUltimateRedirectTarget(); + if ( $redirectTitle && $redirectTitle->equals( $fromTitle ) ) { + return false; + } + }; + // If the magic word __NOAUTOLINKTARGET__ is enabled and the target + // page does indeed contain this magic word, return the page title + // as-is (unlinked). + if ( $this->config->enableNoTargetMagicWord ) { + if ( $this->getContent()->matchMagicWord( \MagicWord::get('MAG_LINKTITLES_NOTARGET') ) ) { + return false; + } + }; + return true; + } + + /** + * Determines if the Target's title is the same as another title. + * @param \Title $otherTitle Other title + * @return boolean True if the $otherTitle is the same, false if not. + */ + public function isSameTitle( \Title $otherTitle) { + return $this->title->equals( $otherTitle ); + } +}