Merge branch 'unit-tests' into develop

This commit is contained in:
Daniel Kraus
2017-08-27 22:53:12 +02:00
20 changed files with 1701 additions and 822 deletions

1
.atomignore Normal file
View File

@ -0,0 +1 @@
gh-pages/

443
Doxyfile

File diff suppressed because it is too large Load Diff

View File

@ -31,3 +31,66 @@ Contributors
- Daniel Kraus (@bovender), main developer
- Ulrich Strauss (@c0nnex), namespaces
- Brent Laabs (@labster), code review and bug fixes
Testing
-------
Starting from version 4.2.0, LinkTitles finally comes with phpunit tests.
Here's how I set up the testing environment. This may not be the canonical way
to do it. Basic information on testing MediaWiki can be found [here](https://www.mediawiki.org/wiki/Manual:PHP_unit_testing).
The following assumes that you have an instance of MediaWiki running locally
on your development machine, and that this machine is running Linux (I personally
use Ubuntu).
1. Pull the MediaWiki repository:
cd ~/Code
git clone --depth 1 https://phabricator.wikimedia.org/source/mediawiki.git
2. Install [composer](https://getcomposer.org) locally and fetch the
dependencies (including development dependencies):
Follow the instructions on the [composer download page](https://getcomposer.org/download),
but instead of running `php composer-setup.php`, run:
php composer-setup.php --install-dir=bin --filename=composer
bin/composer install
3. Install phpunit (it was already installed on my Ubuntu system when I began
testing LinkTitles, so I leave it up to you to figure out how to do it).
4. Copy your `LocalSettings.php` over from your local MediaWiki installation
and remove (or comment out) any lines that reference extensions or skins that
you are not going to install in your test environment. For the purposes of
testing the LinkTitles extension, leave the following line in place:
wfLoadExtensions( array( 'LinkTitles' ));
And ensure the settings file contains the following:
$wgShowDBErrorBacktrace = true;
5. Create a symbolic link to your copy of the LinkTitles repository:
cd ~/Code/mediawiki/extensions
ln -s ~/Code/LinkTitles
6. Make sure your local MediaWiki instance is up to date. Otherwise phpunit may
fail and tell you about database problems.
This is because the local database is used as a template for the unit tests.
For example, I initially had MW 1.26 installed on my laptop, but the cloned
repository was MW 1.29.1. It's probably also possible to clone the repository
with a specific version tag which matches your local installation.
7. Run the tests:
cd ~/Code/mediawiki/tests/phpunit
php phpunit.php --group bovender
This will run all tests from the 'bovender' group, i.e. tests for my extensions.
If you linked just the LinkTitles extension in step 5, only this extension
will be tested.

View File

@ -33,8 +33,14 @@
]
},
"AutoloadClasses": {
"LinkTitles\\Extension": "includes/LinkTitles_Extension.php",
"LinkTitles\\Special": "includes/LinkTitles_Special.php"
"LinkTitles\\Extension": "includes/Extension.php",
"LinkTitles\\Linker": "includes/Linker.php",
"LinkTitles\\Target": "includes/Target.php",
"LinkTitles\\Targets": "includes/Targets.php",
"LinkTitles\\Splitter": "includes/Splitter.php",
"LinkTitles\\Config": "includes/Config.php",
"LinkTitles\\Special": "includes/Special.php",
"LinkTitles\\TestCase": "tests/phpunit/TestCase.php"
},
"SpecialPages": {
"LinkTitles": "LinkTitles\\Special"
@ -61,9 +67,8 @@
"LinkTitles\\Extension::onParserFirstCallInit"
]
},
"callback": "LinkTitles\\Extension::setup",
"ExtensionMessagesFiles": {
"LinkTitlesMagic": "includes/LinkTitles_Magic.php"
"LinkTitlesMagic": "includes/Magic.php"
},
"MessagesDirs": {
"LinkTitles": [

196
includes/Config.php Normal file
View File

@ -0,0 +1,196 @@
<?php
/**
* The LinkTitles\Config class holds configuration for the LinkTitles extension.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
* Holds LinkTitles configuration.
*
* This class encapsulates the global configuration variables so we do not have
* to pull those globals into scope in the individual LinkTitles classes.
*
* Using a dedicated configuration class also facilitates overriding certain
* options, i.e. in a maintenance script that is invoked with flags from the
* command line.
*
* @since 5.0.0
*/
class Config {
	/**
	 * Whether to add links to a page when the page is edited/saved.
	 * @var bool $parseOnEdit
	 */
	public $parseOnEdit;

	/**
	 * Whether to add links to a page when the page is rendered.
	 * @var bool $parseOnRender
	 */
	public $parseOnRender;

	/**
	 * Indicates whether to prioritize short over long titles.
	 * @var bool $preferShortTitles
	 */
	public $preferShortTitles;

	/**
	 * Minimum length of a page title for it to qualify as a potential link target.
	 * @var int $minimumTitleLength
	 */
	public $minimumTitleLength;

	/**
	 * Array of page titles that must never be link targets.
	 *
	 * This may be useful to exclude common abbreviations or acronyms from
	 * automatic linking.
	 * @var array $blackList
	 */
	public $blackList;

	/**
	 * Array of those namespaces (integer constants) whose pages may be linked.
	 * @var array $nameSpaces
	 */
	public $nameSpaces;

	/**
	 * Indicates whether to add a link to the first occurrence of a page title
	 * only (true), or add links to all occurrences on the source page (false).
	 * @var bool $firstOnly
	 */
	public $firstOnly;

	/**
	 * Indicates whether to operate in smart mode, i.e. link to pages even if the
	 * case does not match. Without smart mode, pages are linked to only if the
	 * exact title appears on the source page.
	 * @var bool $smartMode
	 */
	public $smartMode;

	/**
	 * Mirrors the global MediaWiki variable $wgCapitalLinks that indicates
	 * whether or not page titles are fully case sensitive.
	 * @var bool $capitalLinks
	 */
	public $capitalLinks;

	/**
	 * Whether or not to link to pages only if the page title appears at the
	 * start of a word in the text being linked (i.e., link 'MediaWiki' to a
	 * page 'Media', but not to a page 'Wiki').
	 *
	 * Set both $wordStartOnly and $wordEndOnly to true to enforce matching
	 * whole titles.
	 *
	 * @var bool $wordStartOnly
	 */
	public $wordStartOnly;

	/**
	 * Whether or not to link to pages only if the page title appears at the
	 * end of a word in the text being linked (i.e., link 'MediaWiki' to a
	 * page 'Wiki', but not to a page 'Media').
	 *
	 * Set both $wordStartOnly and $wordEndOnly to true to enforce matching
	 * whole titles.
	 *
	 * @var bool $wordEndOnly
	 */
	public $wordEndOnly;

	/**
	 * Whether or not to skip templates. If set to true, text inside transclusions
	 * will not be linked.
	 * @var bool $skipTemplates
	 */
	public $skipTemplates;

	/**
	 * Whether or not to parse headings.
	 * @var bool $parseHeadings
	 */
	public $parseHeadings;

	/**
	 * Whether to check if a potential target page is a redirect that leads
	 * back to the source page. Set this to true to avoid such indirect
	 * linkbacks.
	 *
	 * @var bool $checkRedirect
	 */
	public $checkRedirect;

	/**
	 * Whether to enable the __NOAUTOLINKTARGET__ magic word which prevents
	 * a potential target page from being linked to.
	 *
	 * @var bool $enableNoTargetMagicWord
	 */
	public $enableNoTargetMagicWord;

	/**
	 * Console output switch. Not backed by a global variable; the constructor
	 * always initializes it to false.
	 * NOTE(review): presumably switched on by a command-line/maintenance
	 * script; the consumer is not part of this class -- confirm.
	 *
	 * @var bool $enableConsoleOutput
	 */
	public $enableConsoleOutput;

	/**
	 * Debug console output switch. Not backed by a global variable; the
	 * constructor always initializes it to false.
	 * NOTE(review): presumably enables more verbose console output -- confirm.
	 *
	 * @var bool $enableDebugConsoleOutput
	 */
	public $enableDebugConsoleOutput;

	/**
	 * Constructs a new Config object.
	 *
	 * The object's member variables will automatically be set with the values
	 * from the corresponding global variables. The two console output flags
	 * have no corresponding global and start out as false.
	 */
	public function __construct() {
		global $wgLinkTitlesParseOnEdit;
		global $wgLinkTitlesParseOnRender;
		global $wgLinkTitlesPreferShortTitles;
		global $wgLinkTitlesMinimumTitleLength;
		global $wgLinkTitlesBlackList;
		global $wgLinkTitlesNamespaces;
		global $wgLinkTitlesFirstOnly;
		global $wgLinkTitlesSmartMode;
		global $wgCapitalLinks;
		global $wgLinkTitlesWordStartOnly;
		global $wgLinkTitlesWordEndOnly;
		global $wgLinkTitlesSkipTemplates;
		global $wgLinkTitlesParseHeadings;
		global $wgLinkTitlesEnableNoTargetMagicWord;
		global $wgLinkTitlesCheckRedirect;

		$this->parseOnEdit = $wgLinkTitlesParseOnEdit;
		$this->parseOnRender = $wgLinkTitlesParseOnRender;
		$this->preferShortTitles = $wgLinkTitlesPreferShortTitles;
		$this->minimumTitleLength = $wgLinkTitlesMinimumTitleLength;
		$this->blackList = $wgLinkTitlesBlackList;
		$this->nameSpaces = $wgLinkTitlesNamespaces;
		$this->firstOnly = $wgLinkTitlesFirstOnly;
		$this->smartMode = $wgLinkTitlesSmartMode;
		$this->capitalLinks = $wgCapitalLinks; // MediaWiki global variable
		$this->wordStartOnly = $wgLinkTitlesWordStartOnly;
		$this->wordEndOnly = $wgLinkTitlesWordEndOnly;
		$this->skipTemplates = $wgLinkTitlesSkipTemplates;
		$this->parseHeadings = $wgLinkTitlesParseHeadings;
		$this->enableNoTargetMagicWord = $wgLinkTitlesEnableNoTargetMagicWord;
		$this->checkRedirect = $wgLinkTitlesCheckRedirect;

		// These flags are never read from globals.
		$this->enableConsoleOutput = false;
		$this->enableDebugConsoleOutput = false;
	}
}

147
includes/Extension.php Normal file
View File

@ -0,0 +1,147 @@
<?php
/**
* The LinkTitles\Extension class provides event handlers and entry points for the extension.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
* Provides event handlers and entry points for the extension.
*/
class Extension {
	/**
	 * Event handler that is hooked to the PageContentSave event.
	 *
	 * Adds links to the content that is about to be saved. Nothing is changed
	 * if parsing on edit is disabled, the edit is minor, the page is not in one
	 * of the configured namespaces, or the text contains the __NOAUTOLINKS__
	 * magic word.
	 *
	 * @param $wikiPage Page being saved; only its title is used.
	 * @param $user Unused.
	 * @param $content Content being saved; replaced (by reference) if links were added.
	 * @param $summary Unused.
	 * @param $isMinor Whether the edit is a minor edit.
	 * @param $isWatch Unused.
	 * @param $section Unused.
	 * @param $flags Unused.
	 * @param $status Unused.
	 * @return bool Always true.
	 */
	public static function onPageContentSave( &$wikiPage, &$user, &$content, &$summary,
		$isMinor, $isWatch, $section, &$flags, &$status ) {
		// Use the Config object (like the other handlers in this class)
		// instead of pulling the globals into scope directly.
		$config = new Config();
		if ( !$config->parseOnEdit || $isMinor ) {
			return true;
		}

		$title = $wikiPage->getTitle();

		// Only process if page is in one of our namespaces we want to link
		// Fixes ugly autolinking of sidebar pages
		if ( !in_array( $title->getNamespace(), $config->nameSpaces ) ) {
			return true;
		}

		$text = $content->getContentHandler()->serializeContent( $content );
		if ( \MagicWord::get( 'MAG_LINKTITLES_NOAUTOLINKS' )->match( $text ) ) {
			return true;
		}

		$linker = new Linker( $config );
		$newText = $linker->linkContent( $title, $text );
		if ( $newText != $text ) {
			$content = $content->getContentHandler()->unserializeContent( $newText );
		}
		return true;
	}

	/**
	 * Event handler that is hooked to the InternalParseBeforeLinks event.
	 *
	 * @param Parser $parser Parser that raised the event.
	 * @param $text Preprocessed text of the page; modified in place if links
	 * were added.
	 * @return bool Always true.
	 */
	public static function onInternalParseBeforeLinks( \Parser &$parser, &$text ) {
		$config = new Config();
		if ( !$config->parseOnRender ) {
			return true;
		}
		$title = $parser->getTitle();

		// If the page contains the magic word '__NOAUTOLINKS__', do not parse it.
		// Only process if page is in one of our namespaces we want to link
		if ( !\MagicWord::get( 'MAG_LINKTITLES_NOAUTOLINKS' )->match( $text ) &&
			in_array( $title->getNamespace(), $config->nameSpaces ) ) {
			$linker = new Linker( $config );
			$text = $linker->linkContent( $title, $text );
		}
		return true;
	}

	/**
	 * Automatically processes a single page, given a $title Title object.
	 *
	 * If linking changes the page text, the page is saved as a minor bot edit
	 * on behalf of the context's user.
	 *
	 * @param Title $title Title object.
	 * @param RequestContext $context Current request context. If in doubt, call
	 * MediaWiki's `RequestContext::getMain()` to obtain such an object.
	 * @return bool True if the page has content, false if it does not.
	 */
	public static function processPage( \Title $title, \RequestContext $context ) {
		$page = \WikiPage::factory( $title );
		$content = $page->getContent();
		if ( $content === null ) {
			return false;
		}

		$text = $content->getContentHandler()->serializeContent( $content );
		$config = new Config();
		$linker = new Linker( $config );
		$newText = $linker->linkContent( $title, $text );
		if ( $text != $newText ) {
			$content = $content->getContentHandler()->unserializeContent( $newText );
			$page->doEditContent(
				$content,
				"Links to existing pages added by LinkTitles bot.", // TODO: i18n
				EDIT_MINOR | EDIT_FORCE_BOT,
				false, // baseRevId
				$context->getUser()
			);
		}
		return true;
	}

	/**
	 * Adds the two magic words defined by this extension to the list of
	 * 'double-underscore' terms that are automatically removed before a
	 * page is displayed.
	 *
	 * @param array $doubleUnderscoreIDs Array of magic word IDs; extended in place.
	 * @return bool Always true.
	 */
	public static function onGetDoubleUnderscoreIDs( array &$doubleUnderscoreIDs ) {
		$doubleUnderscoreIDs[] = 'MAG_LINKTITLES_NOTARGET';
		$doubleUnderscoreIDs[] = 'MAG_LINKTITLES_NOAUTOLINKS';
		return true;
	}

	/**
	 * Event handler that is hooked to the ParserFirstCallInit event; registers
	 * the <noautolinks> and <autolinks> tag handlers with the parser.
	 *
	 * @param Parser $parser Parser to register the tag hooks with.
	 * @return bool Always true, like the other hook handlers of this class.
	 */
	public static function onParserFirstCallInit( \Parser $parser ) {
		$parser->setHook( 'noautolinks', 'LinkTitles\Extension::doNoautolinksTag' );
		$parser->setHook( 'autolinks', 'LinkTitles\Extension::doAutolinksTag' );
		return true;
	}

	/**
	 * Handles the <noautolinks> tag by simply returning the (HTML-escaped)
	 * text between the tags, if any.
	 * See https://www.mediawiki.org/wiki/Manual:Tag_extensions#Example
	 *
	 * @param string|null $input Text between the tags; null for a self-closing tag.
	 * @param array $args Tag attributes (unused).
	 * @param Parser $parser Parser (unused).
	 * @param PPFrame $frame Frame (unused).
	 * @return string HTML-escaped input.
	 */
	public static function doNoautolinksTag( $input, array $args, \Parser $parser, \PPFrame $frame ) {
		// The cast keeps a self-closing tag (null input) from passing null
		// to htmlspecialchars(); the result is an empty string either way.
		return htmlspecialchars( (string) $input );
	}

	/**
	 * Handles the <autolinks> tag: adds links to the text between the tags
	 * and has the parser render the result.
	 * See https://www.mediawiki.org/wiki/Manual:Tag_extensions#How_do_I_render_wikitext_in_my_extension.3F
	 *
	 * @param string|null $input Text between the tags.
	 * @param array $args Tag attributes (unused).
	 * @param Parser $parser Parser that provides the current title and renders the result.
	 * @param PPFrame $frame Frame that is passed on to the parser.
	 * @return string Output of the parser's recursiveTagParse().
	 */
	public static function doAutolinksTag( $input, array $args, \Parser $parser, \PPFrame $frame ) {
		$config = new Config();
		$linker = new Linker( $config );
		$title = $parser->getTitle();
		$withLinks = $linker->linkContent( $title, $input );
		$output = $parser->recursiveTagParse( $withLinks, $frame );
		return $output;
	}
}
// vim: ts=2:sw=2:noet:comments^=\:///

View File

@ -1,527 +0,0 @@
<?php
/*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*/
/// @file
namespace LinkTitles;
/// Debugging aid: writes a human-readable rendering of a value to stderr.
/// @param $var Any variable; its print_r() representation is written out,
/// followed by a newline.
/// @return undefined
function dump($var) {
	$rendered = print_r($var, true);
	error_log($rendered . "\n", 3, 'php://stderr');
}
/// Central class of the extension. Sets up parser hooks.
/// This class contains only static functions; do not instantiate.
class Extension {
/// Caching variable for page titles that are fetched from the DB.
/// Holds the result of fetchPageTitles() and is refreshed by parseContent()
/// whenever the namespace of the processed page changes.
private static $pageTitles;
/// Caching variable for the current namespace.
/// This is needed because the sort order of the page titles that
/// are cached in self::$pageTitles depends on the namespace of
/// the page currently being processed.
private static $currentNamespace;
/// A Title object for the page that is being parsed.
private static $currentTitle;
/// A Title object for the target page currently being examined.
private static $targetTitle;
/// The TitleValue object of the target page, as obtained from
/// self::$targetTitle in newTarget().
private static $targetTitleValue;
/// The content object for the currently processed target page.
/// This variable is necessary to be able to prevent loading the target
/// content twice. It is reset to null by newTarget().
private static $targetContent;
/// Holds the page title of the currently processed target page
/// as a string.
private static $targetTitleText;
/// Delimiter used in a regexp split operation to separate those parts
/// of the page that should be parsed from those that should not be
/// parsed (e.g. inside pre-existing links etc.).
private static $delimiter;
/// Regexp fragment that is placed in front of the title pattern. It is set by
/// BuildDelimiters() to a negative lookbehind for a Unicode letter if
/// $wgLinkTitlesWordStartOnly is set, and to an empty string otherwise.
private static $wordStartDelim;
/// Regexp fragment that is placed after the title pattern. It is set by
/// BuildDelimiters() to a negative lookahead for a Unicode letter if
/// $wgLinkTitlesWordEndOnly is set, and to an empty string otherwise.
private static $wordEndDelim;
/// NOTE(review): presumably switches console output of ltLog() on or off;
/// the code that reads this flag is not visible here -- confirm.
public static $ltConsoleOutput;
/// NOTE(review): presumably switches console output of ltDebugLog() on or
/// off; the code that reads this flag is not visible here -- confirm.
public static $ltConsoleOutputDebug;
/// Setup callback of the extension: builds the regular expression
/// delimiters once so that they are available when pages are parsed.
public static function setup() {
	self::BuildDelimiters();
}
/// Hook handler for the PageContentSave event. Adds links to the content
/// that is about to be saved, unless parsing on edit is disabled, the edit
/// is a minor one, the page is outside the configured namespaces, or the
/// text contains the __NOAUTOLINKS__ magic word.
/// @return true
public static function onPageContentSave( &$wikiPage, &$user, &$content, &$summary,
	$isMinor, $isWatch, $section, &$flags, &$status ) {
	global $wgLinkTitlesParseOnEdit;
	global $wgLinkTitlesNamespaces;

	// Nothing to do if parsing on edit is turned off or the edit is minor.
	if ( !$wgLinkTitlesParseOnEdit || $isMinor ) {
		return true;
	}

	// Only pages in the configured namespaces are processed; this
	// prevents ugly autolinking of sidebar pages.
	$pageTitle = $wikiPage->getTitle();
	if ( !in_array( $pageTitle->getNamespace(), $wgLinkTitlesNamespaces ) ) {
		return true;
	}

	$oldText = $content->getContentHandler()->serializeContent( $content );
	if ( \MagicWord::get( 'MAG_LINKTITLES_NOAUTOLINKS' )->match( $oldText ) ) {
		return true;
	}

	$linkedText = self::parseContent( $pageTitle, $oldText );
	if ( $linkedText != $oldText ) {
		$content = $content->getContentHandler()->unserializeContent( $linkedText );
	}
	return true;
}
/// Hook handler for the InternalParseBeforeLinks event. Adds links to the
/// text while the page is rendered, if parsing on render is enabled.
/// @param Parser $parser Parser that raised the event.
/// @param $text Preprocessed text of the page; modified in place.
/// @return true
public static function onInternalParseBeforeLinks( \Parser &$parser, &$text ) {
	global $wgLinkTitlesParseOnRender;
	global $wgLinkTitlesNamespaces;

	if ( !$wgLinkTitlesParseOnRender ) {
		return true;
	}

	$pageTitle = $parser->getTitle();

	// Pages that contain the magic word '__NOAUTOLINKS__' are left alone.
	if ( \MagicWord::get( 'MAG_LINKTITLES_NOAUTOLINKS' )->match( $text ) ) {
		return true;
	}

	// So are pages outside the namespaces that we want to link.
	if ( !in_array( $pageTitle->getNamespace(), $wgLinkTitlesNamespaces ) ) {
		return true;
	}

	$text = self::parseContent( $pageTitle, $text );
	return true;
}
/// Core function of the extension, performs the actual parsing of the content.
///
/// For every candidate target title, the text is split into parts that may
/// be linked and parts that must be left alone (see self::$delimiter); the
/// title is then searched for in the former. With smart mode enabled, a
/// second, case-insensitive pass is performed for the same title.
///
/// @param Title $title Title of the page that is being processed
/// @param $text String that holds the article content (not modified)
/// @returns string: parsed text with links added if needed
private static function parseContent( $title, &$text ) {
	// Configuration variables need to be defined here as globals.
	global $wgLinkTitlesFirstOnly;
	global $wgLinkTitlesSmartMode;
	global $wgCapitalLinks;

	$limit = $wgLinkTitlesFirstOnly ? 1 : -1;
	self::$currentTitle = $title;
	$currentNamespace = $title->getNamespace();
	$newText = $text;

	if ( !isset( self::$pageTitles ) || ( $currentNamespace != self::$currentNamespace ) ) {
		self::$currentNamespace = $currentNamespace;
		self::$pageTitles = self::fetchPageTitles( $currentNamespace );
	}

	// Iterate through the page titles
	foreach ( self::$pageTitles as $row ) {
		self::newTarget( $row->page_namespace, $row->page_title );

		// Don't link current page
		if ( self::$targetTitle->equals( self::$currentTitle ) ) { continue; }

		// The 'first occurrence only' limit applies to each target title
		// separately, so the flag must be reset for every target. Otherwise
		// the smart mode pass would be skipped for all remaining titles once
		// any single title has been linked.
		$limitReached = false;

		// Escape certain special characters in the page title to prevent
		// regexp compilation errors
		self::$targetTitleText = self::$targetTitle->getText();
		$quotedTitle = preg_quote( self::$targetTitleText, '/' );
		self::ltDebugLog( 'TargetTitle='. self::$targetTitleText, 'private' );
		self::ltDebugLog( 'TargetTitleQuoted='. $quotedTitle, 'private' );

		// Depending on the global configuration setting $wgCapitalLinks,
		// the title has to be searched for either in a strictly case-sensitive
		// way, or in a 'fuzzy' way where the first letter of the title may
		// be either case.
		if ( $wgCapitalLinks && ( $quotedTitle[0] != '\\' )) {
			$searchTerm = '((?i)' . $quotedTitle[0] . '(?-i)' .
				substr($quotedTitle, 1) . ')';
		} else {
			$searchTerm = '(' . $quotedTitle . ')';
		}
		$regex = '/(?<![\:\.\@\/\?\&])' . self::$wordStartDelim .
			$searchTerm . self::$wordEndDelim . '/S';

		// split the page content by [[...]] groups
		// credits to inhan @ StackOverflow for suggesting preg_split
		// see http://stackoverflow.com/questions/10672286
		$arr = preg_split( self::$delimiter, $newText, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( $arr === false ) {
			// preg_split() failed (e.g. PCRE backtrack limit). Leave the text
			// untouched for this target rather than losing it.
			continue;
		}

		$segmentCount = count( $arr );
		for ( $i = 0; $i < $segmentCount; $i += 2 ) {
			// even indexes will point to text that is not enclosed by brackets
			$replaced = preg_replace_callback( $regex,
				'LinkTitles\Extension::simpleModeCallback', $arr[$i], $limit, $count );
			if ( $replaced === null ) {
				// PCRE error: keep the original segment.
				continue;
			}
			$arr[$i] = $replaced;
			if ( $wgLinkTitlesFirstOnly && ( $count > 0 ) ) {
				$limitReached = true;
				break;
			}
		}
		$newText = implode( '', $arr );

		// If smart mode is turned on, the extension will perform a second
		// pass on the page and add links with aliases where the case does
		// not match.
		if ( $wgLinkTitlesSmartMode && !$limitReached ) {
			$arr = preg_split( self::$delimiter, $newText, -1, PREG_SPLIT_DELIM_CAPTURE );
			if ( $arr !== false ) {
				$smartRegex = '/(?<![\:\.\@\/\?\&])' .
					self::$wordStartDelim . '(' . $quotedTitle . ')' .
					self::$wordEndDelim . '/iS';
				$segmentCount = count( $arr );
				for ( $i = 0; $i < $segmentCount; $i += 2 ) {
					// even indexes will point to text that is not enclosed by brackets
					$replaced = preg_replace_callback( $smartRegex,
						'LinkTitles\Extension::smartModeCallback', $arr[$i], $limit, $count );
					if ( $replaced === null ) {
						// PCRE error: keep the original segment.
						continue;
					}
					$arr[$i] = $replaced;
					if ( $wgLinkTitlesFirstOnly && ( $count > 0 ) ) {
						break;
					}
				}
				$newText = implode( '', $arr );
			}
		} // $wgLinkTitlesSmartMode
	} // foreach self::$pageTitles as $row
	return $newText;
}
/// Processes a single page, given its Title object, and saves the page
/// as a minor bot edit if links were added.
/// This function is called by the SpecialLinkTitles class and the
/// LinkTitlesJob class.
/// @param Title $title Title object.
/// @param RequestContext $context Current request context.
/// If in doubt, call MediaWiki's `RequestContext::getMain()`
/// to obtain such an object.
/// @returns boolean True if the page has content, false if it does not
public static function processPage( \Title $title, \RequestContext $context ) {
	self::ltLog('Processing '. $title->getPrefixedText());

	$wikiPage = \WikiPage::factory($title);
	$content = $wikiPage->getContent();
	if ( $content == null ) {
		// No content, nothing to process.
		return false;
	}

	$oldText = $content->getContentHandler()->serializeContent($content);
	$linkedText = self::parseContent($title, $oldText);
	if ( $oldText != $linkedText ) {
		$content = $content->getContentHandler()->unserializeContent( $linkedText );
		$wikiPage->doEditContent(
			$content,
			"Links to existing pages added by LinkTitles bot.", // TODO: i18n
			EDIT_MINOR | EDIT_FORCE_BOT,
			false, // baseRevId
			$context->getUser()
		);
	}
	return true;
}
/// Appends the IDs of the two magic words that this extension defines to
/// the list of 'double-underscore' terms, which are removed automatically
/// before a page is displayed.
/// @param $doubleUnderscoreIDs Array of magic word IDs; extended in place.
/// @return true
public static function onGetDoubleUnderscoreIDs( array &$doubleUnderscoreIDs ) {
	array_push(
		$doubleUnderscoreIDs,
		'MAG_LINKTITLES_NOTARGET',
		'MAG_LINKTITLES_NOAUTOLINKS'
	);
	return true;
}
/// Hook handler for the ParserFirstCallInit event. Registers the handlers
/// for the <noautolinks> and <autolinks> tags with the parser.
/// @param Parser $parser Parser to register the tag hooks with.
public static function onParserFirstCallInit( \Parser $parser ) {
	$tagHandlers = array(
		'noautolinks' => 'LinkTitles\Extension::doNoautolinksTag',
		'autolinks' => 'LinkTitles\Extension::doAutolinksTag',
	);
	foreach ( $tagHandlers as $tagName => $handler ) {
		$parser->setHook( $tagName, $handler );
	}
}
/// Handler for the <noautolinks> tag: the tag itself is dropped and the
/// text between the tags (if any) is returned with HTML special
/// characters escaped.
/// See https://www.mediawiki.org/wiki/Manual:Tag_extensions#Example
public static function doNoautolinksTag( $input, array $args, \Parser $parser, \PPFrame $frame ) {
	$escaped = htmlspecialchars( $input );
	return $escaped;
}
/// Handler for the <autolinks> tag: adds links to the text between the
/// tags and lets the parser render the result.
/// See https://www.mediawiki.org/wiki/Manual:Tag_extensions#How_do_I_render_wikitext_in_my_extension.3F
public static function doAutolinksTag( $input, array $args, \Parser $parser, \PPFrame $frame ) {
	$pageTitle = $parser->getTitle();
	return $parser->recursiveTagParse( self::parseContent( $pageTitle, $input ), $frame );
}
/// Fetches the titles of potential target pages from the database.
///
/// Only pages in the namespaces listed in $wgLinkTitlesNamespaces (plus the
/// current namespace) are returned, and only if their title has the minimum
/// length and is not blacklisted. Results are ordered by namespace (current
/// namespace first) and then by title length.
///
/// @param $currentNamespace Namespace of the page currently being processed.
/// @returns Result of the database query with the fields page_title,
/// page_namespace and weight.
private static function fetchPageTitles( $currentNamespace ) {
	global $wgLinkTitlesPreferShortTitles;
	global $wgLinkTitlesMinimumTitleLength;
	global $wgLinkTitlesBlackList;
	global $wgLinkTitlesNamespaces;

	$sortOrder = $wgLinkTitlesPreferShortTitles ? 'ASC' : 'DESC';

	// The following values end up verbatim in the SQL; make sure they are
	// plain integers.
	$minimumLength = (int) $wgLinkTitlesMinimumTitleLength;

	// Build our weight list. Make sure current namespace is first element
	$namespaces = array_map(
		'intval',
		array_diff( $wgLinkTitlesNamespaces, [ $currentNamespace ] )
	);
	array_unshift( $namespaces, (int) $currentNamespace );

	// No need for sanity check. We are sure that we have at least one
	// element in the array.
	$weightSelect = "CASE page_namespace ";
	$currentWeight = 0;
	foreach ( $namespaces as $namespaceValue ) {
		$currentWeight += 100;
		$weightSelect .= " WHEN " . $namespaceValue . " THEN " . $currentWeight . PHP_EOL;
	}
	$weightSelect .= " END ";
	$namespacesClause = '(' . implode( ', ', $namespaces ) . ')';

	$dbr = wfGetDB( DB_SLAVE );

	// Build a blacklist of pages that are not supposed to be link targets.
	// The titles are quoted by the database layer so that titles containing
	// quote characters cannot break the query.
	$blackListCondition = null;
	if ( !empty( $wgLinkTitlesBlackList ) ) {
		$quotedTitles = [];
		foreach ( $wgLinkTitlesBlackList as $blackListedTitle ) {
			$quotedTitles[] = $dbr->addQuotes( str_replace( ' ', '_', $blackListedTitle ) );
		}
		$blackListCondition = 'page_title NOT IN (' . implode( ',', $quotedTitles ) . ')';
	}

	// __METHOD__ would evaluate to '{closure}' inside the closure below.
	$method = __METHOD__;

	// Runs the query with the given SQL function to determine the length of
	// a title.
	$runQuery = function ( $lengthFunction ) use (
		$dbr, $weightSelect, $namespacesClause, $minimumLength,
		$blackListCondition, $sortOrder, $method
	) {
		$conditions = [
			'page_namespace IN ' . $namespacesClause,
			$lengthFunction . '(page_title) >= ' . $minimumLength,
		];
		if ( $blackListCondition !== null ) {
			$conditions[] = $blackListCondition;
		}
		return $dbr->select(
			'page',
			[ 'page_title', 'page_namespace', 'weight' => $weightSelect ],
			$conditions,
			$method,
			[ 'ORDER BY' => 'weight ASC, ' . $lengthFunction . '(page_title) ' . $sortOrder ]
		);
	};

	// Since the db may be sqlite, we need a try..catch structure because
	// sqlite does not support the CHAR_LENGTH function. Note the leading
	// backslash: an unqualified 'Exception' would refer to the non-existent
	// class LinkTitles\Exception, and the fallback would never be used.
	try {
		return $runQuery( 'CHAR_LENGTH' );
	} catch ( \Exception $e ) {
		return $runQuery( 'LENGTH' );
	}
}
// Callback function for use with preg_replace_callback in simple mode.
// Wraps the matched text in double square brackets if the current target
// page may be linked to; otherwise the matched text is returned unchanged.
private static function simpleModeCallback( array $matches ) {
	$matchedText = $matches[0];
	if ( !self::checkTargetPage() ) {
		return $matchedText;
	}
	self::ltLog( "Linking '$matches[0]' to '" . self::$targetTitle . "'" );
	return '[[' . $matchedText . ']]';
}
// Callback function for use with preg_replace_callback in smart mode.
// Compares the title of the current target page with the occurrence found
// on the page in a case-sensitive way. If they are the same, a plain link
// is built; if they differ, an aliased (piped) link is built.
// If $wgCapitalLinks is set to true, MediaWiki ignores the case of the
// first letter of a title, so the first character is left out of the
// comparison.
private static function smartModeCallback( array $matches ) {
	global $wgCapitalLinks;

	$matchedText = $matches[0];
	if ( !self::checkTargetPage() ) {
		// The target page must not be linked to.
		return $matchedText;
	}

	self::ltLog( "Linking (smart) '$matches[0]' to '" . self::$targetTitle . "'" );

	if ( $wgCapitalLinks ) {
		// The first letter does not matter for linking.
		$sameCase = strcmp( substr( self::$targetTitleText, 1 ), substr( $matchedText, 1 ) ) == 0;
	} else {
		$sameCase = strcmp( self::$targetTitleText, $matchedText ) == 0;
	}

	if ( $sameCase ) {
		// Case-sensitive match: no need to build piped link.
		return '[[' . $matchedText . ']]';
	}
	// Case-insensitive match: build piped link.
	return '[[' . self::$targetTitleText . '|' . $matchedText . ']]';
}
/// Makes the page with the given namespace and title the current target:
/// stores its Title and TitleValue objects and discards the cached content
/// of the previous target.
private static function newTarget( $ns, $title ) {
	$target = \Title::makeTitleSafe( $ns, $title );
	self::$targetTitle = $target;
	self::ltDebugLog( 'newtarget='. $target->getText(), "private" );

	$titleValue = $target->getTitleValue();
	self::$targetTitleValue = $titleValue;
	self::ltDebugLog( 'altTarget='. $titleValue->getText(), "private" );

	self::$targetContent = null;
}
/// Returns the content of the current target page.
/// This function serves to be used in preg_replace_callback callback
/// functions, in order to load the target page content from the
/// database only when needed. The content is cached in
/// self::$targetContent until newTarget() resets it.
/// @note It is absolutely necessary that the newTarget()
/// function is called for every new page.
/// @returns Content object of the target page
private static function getTargetContent() {
	// Check the static cache variable, not a (never defined) local
	// variable; otherwise the content would be loaded on every call.
	if ( !isset( self::$targetContent ) ) {
		self::$targetContent = \WikiPage::factory(
			self::$targetTitle )->getContent();
	}
	return self::$targetContent;
}
/// Decides whether the current target page may be linked to.
/// Returns false if $wgLinkTitlesCheckRedirect is set and the target page
/// ultimately redirects to the current page, or if
/// $wgLinkTitlesEnableNoTargetMagicWord is set and the target page contains
/// the __NOAUTOLINKTARGET__ magic word; returns true otherwise.
/// The target content is only loaded if one of these settings is enabled.
/// @returns boolean
private static function checkTargetPage() {
	global $wgLinkTitlesEnableNoTargetMagicWord;
	global $wgLinkTitlesCheckRedirect;

	if ( $wgLinkTitlesCheckRedirect ) {
		$ultimateTarget = self::getTargetContent()->getUltimateRedirectTarget();
		$redirectsToCurrentPage = $ultimateTarget && $ultimateTarget->equals(self::$currentTitle);
		if ( $redirectsToCurrentPage ) {
			return false;
		}
	}

	if ( !$wgLinkTitlesEnableNoTargetMagicWord ) {
		return true;
	}

	$targetContent = self::getTargetContent();
	if ( $targetContent->matchMagicWord( \MagicWord::get('MAG_LINKTITLES_NOTARGET') ) ) {
		return false;
	}
	return true;
}
/// Builds the delimiter that is used in a regexp to separate
/// text that should be parsed from text that should not be
/// parsed (e.g. inside existing links etc.).
/// The results are stored in self::$wordStartDelim, self::$wordEndDelim
/// and self::$delimiter; nothing is returned.
private static function BuildDelimiters() {
	// Configuration variables need to be defined here as globals.
	global $wgLinkTitlesParseHeadings;
	global $wgLinkTitlesSkipTemplates;
	global $wgLinkTitlesWordStartOnly;
	global $wgLinkTitlesWordEndOnly;

	// Use unicode character properties rather than \b escape sequences
	// to detect whole words containing non-ASCII characters as well.
	// Note that this requires a PCRE library that was compiled with
	// --enable-unicode-properties
	self::$wordStartDelim = $wgLinkTitlesWordStartOnly ? '(?<!\pL)' : '';
	self::$wordEndDelim = $wgLinkTitlesWordEndOnly ? '(?!\pL)' : '';

	if ( $wgLinkTitlesSkipTemplates ) {
		// Use recursive regex to balance curly braces;
		// see http://www.regular-expressions.info/recurse.html
		$templatesDelimiter = '{{(?>[^{}]|(?R))*}}|';
	} else {
		// Match template names (ignoring any piped [[]] links in them)
		// along with the trailing pipe and parameter name or closing
		// braces; also match sequences of '|wordcharacters=' (without
		// spaces in them) that usually only occur as parameter names in
		// transclusions (but could also occur as wiki table cell contents).
		// TODO: Find a way to match parameter names in transclusions, but
		// not in table cells or other sequences involving a pipe character
		// and equal sign.
		$templatesDelimiter = '{{[^|]*?(?:(?:\[\[[^]]+]])?)[^|]*?(?:\|(?:\w+=)?|(?:}}))|\|\w+=|';
	}

	// Build a regular expression that will capture existing wiki links ("[[...]]"),
	// wiki headings ("= ... =", "== ... ==" etc.),
	// urls ("http://example.com", "[http://example.com]", "[http://example.com Description]"),
	// and email addresses ("mail@example.com").
	// Since there is a user option to skip headings, we make this part of the expression
	// optional. Note that in order to use preg_split(), it is important to have only one
	// capturing subpattern (which precludes the use of conditional subpatterns).
	$delimiter = $wgLinkTitlesParseHeadings ? '' : '=+.+?=+|';
	$urlPattern = '[a-z]+?\:\/\/(?:\S+\.)+\S+(?:\/.*)?';
	self::$delimiter = '/(' .                     // exclude from linking:
		'\[\[.*?\]\]|' .                            // links
		$delimiter .                                // titles (if requested)
		$templatesDelimiter .                       // templates (if requested)
		'^ .+?\n|\n .+?\n|\n .+?$|^ .+?$|' .        // preformatted text
		// The closing nowiki tag is matched with an escaped slash, like all
		// other closing tags, so that only a real '</nowiki>' ends the section.
		'<nowiki>.*?<\/nowiki>|<code>.*?<\/code>|' . // nowiki/code
		'<pre>.*?<\/pre>|<html>.*?<\/html>|' .      // pre/html
		'<script>.*?<\/script>|' .                  // script
		'<gallery>.*?<\/gallery>|' .                // gallery
		'<div.+?>|<\/div>|' .                       // attributes of div elements
		'<span.+?>|<\/span>|' .                     // attributes of span elements
		'<file>[^<]*<\/file>|' .                    // stuff inside file elements
		'style=".+?"|class=".+?"|' .                // styles and classes (e.g. of wikitables)
		'<noautolinks>.*?<\/noautolinks>|' .        // custom tag 'noautolinks'
		'\[' . $urlPattern . '\s.+?\]|'. $urlPattern . '(?=\s|$)|' . // urls
		'(?<=\b)\S+\@(?:\S+\.)+\S+(?=\b)' .         // email addresses
		')/ismS';
}
/// Writes a debug message to the 'LinkTitles' debug log group; if
/// self::$ltConsoleOutputDebug is set, the message is printed to the
/// console first (followed by a line break).
/// @param $text Message to log.
public static function ltDebugLog($text) {
	$echoToConsole = self::$ltConsoleOutputDebug;
	if ( $echoToConsole ) {
		print "{$text}\n";
	}
	wfDebugLog( 'LinkTitles', $text, 'private' );
}
/// Writes a log message to the 'LinkTitles' debug log group; if
/// self::$ltConsoleOutput is set, the message is printed to the console
/// first (followed by a line break).
/// @param $text Message to log.
public static function ltLog($text) {
	$echoToConsole = self::$ltConsoleOutput;
	if ( $echoToConsole ) {
		print "{$text}\n";
	}
	wfDebugLog( 'LinkTitles', $text, 'private' );
}
}
// vim: ts=2:sw=2:noet:comments^=\:///

184
includes/Linker.php Normal file
View File

@ -0,0 +1,184 @@
<?php
/**
* The LinkTitles\Linker class does the heavy linking for the extension.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
 * Performs the actual linking of content to existing pages.
 */
class Linker {
	/**
	 * LinkTitles configuration.
	 *
	 * @var Config $config
	 */
	public $config;

	/**
	 * The string representation of the title object for the potential target page
	 * that is currently being processed.
	 *
	 * This is an instance variable (rather than a local method variable) so it
	 * can be accessed in the preg_replace_callback callbacks.
	 *
	 * @var String $targetTitleText
	 */
	private $targetTitleText;

	/**
	 * Constructs a new instance of the Linker class.
	 *
	 * @param Config $config LinkTitles configuration object.
	 */
	public function __construct( Config &$config ) {
		$this->config = $config;
	}

	/**
	 * Core function of the extension, performs the actual parsing of the content.
	 *
	 * This method receives a Title object and the string representation of the
	 * source page. It does not work on a WikiPage object directly because the
	 * callbacks in the Extension class do not always get a WikiPage object in the
	 * first place.
	 *
	 * @param \Title &$title Title object for the current page.
	 * @param String &$text String that holds the article content (not modified).
	 * @return String with links to target pages
	 */
	public function linkContent( \Title &$title, &$text ) {
		$limit = $this->config->firstOnly ? 1 : -1;
		$newText = $text;
		$splitter = Splitter::default( $this->config );
		$targets = Targets::default( $title, $this->config );

		// Iterate through the target page titles
		foreach ( $targets->queryResult as $row ) {
			$target = new Target( $row->page_namespace, $row->page_title, $this->config );

			// Don't link current page and don't link if the target page redirects
			// to the current page or has the __NOAUTOLINKTARGET__ magic word
			// (as required by the actual LinkTitles configuration).
			if ( $target->isSameTitle( $title ) || !$target->mayLinkTo( $title ) ) {
				continue;
			}

			// Split the page content by non-linkable sections.
			// Credits to inhan @ StackOverflow for suggesting preg_split.
			// See http://stackoverflow.com/questions/10672286
			$arr = $splitter->split( $newText );
			if ( !is_array( $arr ) ) {
				// preg_split failed; leave the text untouched for this target
				// rather than replacing it with an empty string.
				continue;
			}

			// Cache the target title text for the regex callbacks
			$this->targetTitleText = $target->getTitleText();

			// First pass: case-sensitive matches. The number of links is summed
			// up over all sections so that the second pass knows whether the
			// text was changed at all, not just whether the last section was.
			$count = $this->linkSections( $arr, $target->getCaseSensitiveRegex(),
				'simpleModeCallback', $limit );
			// The 'first only' limit applies to each target title separately,
			// therefore this flag must not be carried over to the next target.
			$limitReached = $this->config->firstOnly && ( $count > 0 );
			$newText = implode( '', $arr );

			// If smart mode is turned on, the extension will perform a second
			// pass on the page and add links with aliases where the case does
			// not match.
			if ( $this->config->smartMode && !$limitReached ) {
				if ( $count > 0 ) {
					// Split the text again because it was changed in the first pass;
					// the new links must end up in the non-linkable sections.
					$arr = $splitter->split( $newText );
					if ( !is_array( $arr ) ) {
						continue;
					}
				}
				$this->linkSections( $arr, $target->getCaseInsensitiveRegex(),
					'smartModeCallback', $limit );
				$newText = implode( '', $arr );
			}
		}
		return $newText;
	}

	/**
	 * Runs one linking pass over the linkable sections of a split text.
	 *
	 * Even indexes of $sections point to sections of the text that may be
	 * linked; odd indexes hold the delimiters and are left alone. If the
	 * 'firstOnly' option is set, the pass stops after the first section in
	 * which a link was added.
	 *
	 * @param array &$sections Split text; linkable sections are modified in place.
	 * @param String $regex Regular expression that matches the target title.
	 * @param String $callback Name of the callback method that builds the link.
	 * @param Int $limit Maximum number of replacements per section (-1 for no limit).
	 * @return Int Total number of links that were added in this pass.
	 */
	private function linkSections( &$sections, $regex, $callback, $limit ) {
		$total = 0;
		$sectionCount = count( $sections );
		for ( $i = 0; $i < $sectionCount; $i += 2 ) {
			$replacements = 0;
			$linked = preg_replace_callback( $regex, array( $this, $callback ),
				$sections[$i], $limit, $replacements );
			if ( $linked === null ) {
				// A PCRE error occurred; keep the original section instead of
				// overwriting it with null (which would delete the text).
				continue;
			}
			$sections[$i] = $linked;
			$total += $replacements;
			if ( $this->config->firstOnly && ( $total > 0 ) ) {
				break;
			}
		}
		return $total;
	}

	/**
	 * Callback for preg_replace_callback in simple mode.
	 *
	 * @param array $matches Matches provided by preg_replace_callback
	 * @return string The matched text, wrapped in link markup
	 */
	private function simpleModeCallback( array $matches ) {
		return '[[' . $matches[0] . ']]';
	}

	/**
	 * Callback function for use with preg_replace_callback.
	 * This essentially performs a case-sensitive comparison of the
	 * current page title and the occurrence found on the page; if
	 * the cases do not match, it builds an aliased (piped) link.
	 * If $wgCapitalLinks is set to true, the case of the first
	 * letter is ignored by MediaWiki and we don't need to build a
	 * piped link if only the case of the first letter is different.
	 *
	 * @param array $matches Matches provided by preg_replace_callback
	 * @return string The matched text, wrapped in plain or piped link markup
	 */
	private function smartModeCallback( array $matches ) {
		$found = $matches[0];
		if ( $this->config->capitalLinks ) {
			// The first letter does not matter for linking, so compare
			// everything after it. The first letter is skipped as a
			// character (not as a byte) so that titles starting with a
			// multi-byte letter are compared correctly.
			$sameCase = mb_substr( $this->targetTitleText, 1, null, 'UTF-8' )
				=== mb_substr( $found, 1, null, 'UTF-8' );
		} else {
			$sameCase = strcmp( $this->targetTitleText, $found ) == 0;
		}
		if ( $sameCase ) {
			// Case-sensitive match: no need to build piped link.
			return '[[' . $found . ']]';
		}
		// Case-insensitive match: build piped link.
		return '[[' . $this->targetTitleText . '|' . $found . ']]';
	}
}
// vim: ts=2:sw=2:noet:comments^=\:///

View File

@ -1,5 +1,7 @@
<?php
/*
/**
* Provides a special page for the LinkTitles extension.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
@ -16,6 +18,8 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/// @defgroup batch Batch processing
@ -26,13 +30,17 @@ if ( !defined( 'MEDIAWIKI' ) ) {
}
/// @endcond
/// Provides a special page that can be used to batch-process all pages in
/// the wiki. By default, this can only be performed by sysops.
/// @ingroup batch
/**
* Provides a special page that can be used to batch-process all pages in
* the wiki. By default, this can only be performed by sysops.
* @ingroup batch
*
*/
class Special extends \SpecialPage {
/// Constructor. Announces the special page title and required user right
/// to the parent constructor.
/**
* Constructor. Announces the special page title and required user right to the parent constructor.
*/
function __construct() {
// the second parameter in the following function call ensures that only
// users who have the 'linktitles-batch' right get to see this page (by
@ -44,9 +52,11 @@ class Special extends \SpecialPage {
return 'pagetools';
}
/// Entry function of the special page class. Will abort if the user does
/// not have appropriate permissions ('linktitles-batch').
/// @return undefined
/**
* Entry function of the special page class. Will abort if the user does not have appropriate permissions ('linktitles-batch').
* @param $par Additional parameters (required by interface; currently not used)
*/
function execute($par) {
// Prevent non-authorized users from executing the batch processing.
if ( !$this->userCanExecute( $this->getUser() ) ) {
@ -76,12 +86,13 @@ class Special extends \SpecialPage {
}
}
/// Processes wiki articles, starting at the page indicated by
/// $startTitle. If $wgLinkTitlesTimeLimit is reached before all pages are
/// processed, returns the title of the next page that needs processing.
/// @param WebRequest $request WebRequest object that is associated with the special
/// page.
/// @param OutputPage $output Output page for the special page.
/**
* Processes wiki articles, starting at the page indicated by
* $startTitle. If $wgLinkTitlesTimeLimit is reached before all pages are
* processed, returns the title of the next page that needs processing.
* @param WebRequest $request WebRequest object that is associated with the special page.
* @param OutputPage $output Output page for the special page.
*/
private function process( \WebRequest &$request, \OutputPage &$output) {
global $wgLinkTitlesTimeLimit;
global $wgLinkTitlesNamespaces;
@ -113,9 +124,7 @@ class Special extends \SpecialPage {
$end = $this->countPages($dbr, $namespacesClause );
};
array_key_exists('r', $postValues) ?
$reloads = $postValues['r'] :
$reloads = 0;
array_key_exists('r', $postValues) ? $reloads = $postValues['r'] : $reloads = 0;
// Retrieve page names from the database.
$res = $dbr->select(
@ -162,8 +171,10 @@ class Special extends \SpecialPage {
}
}
/// Adds WikiText to the output containing information about the extension
/// and a form and button to start linking.
/*
* Adds WikiText to the output containing information about the extension
* and a form and button to start linking.
*/
private function buildInfoPage( &$request, &$output ) {
$url = $request->getRequestURL();
@ -192,12 +203,13 @@ EOF
);
}
/// Produces informative output in WikiText format to show while working.
/// @param $output Output object.
/// @param $curTitle Title of the currently processed page.
/// @param $index Index of the currently processed page.
/// @param $end Last index that will be processed (i.e., number of
/// pages).
/*
* Produces informative output in WikiText format to show while working.
* @param $output Output object.
* @param $curTitle Title of the currently processed page.
* @param $index Index of the currently processed page.
* @param $end Last index that will be processed (i.e., number of pages).
*/
private function addProgressInfo( &$output, $curTitle, $index, $end ) {
$progress = $index / $end * 100;
$percent = sprintf("%01.1f", $progress);
@ -232,14 +244,15 @@ EOF
);
}
/// Generates an HTML form and JavaScript to automatically submit the
/// form.
/// @param $url URL to reload with a POST request.
/// @param $start Index of the next page that shall be processed.
/// @param $end Index of the last page to be processed.
/// @param $reloads Counter that holds the number of reloads so far.
/// @returns String that holds the HTML for a form and a
/// JavaScript command.
/*
* Generates an HTML form and JavaScript to automatically submit the
* form.
* @param $url URL to reload with a POST request.
* @param $start Index of the next page that shall be processed.
* @param $end Index of the last page to be processed.
* @param $reloads Counter that holds the number of reloads so far.
* @returns String that holds the HTML for a form and a JavaScript command.
*/
private function getReloaderForm( $url, $start, $end, $reloads ) {
return
<<<EOF
@ -255,12 +268,14 @@ EOF
;
}
/// Adds statistics to the page when all processing is done.
/// @param $output Output object
/// @param $start Index of the first page that was processed.
/// @param $end Index of the last processed page.
/// @param $reloads Number of reloads of the page.
/// @returns undefined
/*
* Adds statistics to the page when all processing is done.
* @param $output Output object
* @param $start Index of the first page that was processed.
* @param $end Index of the last processed page.
* @param $reloads Number of reloads of the page.
* @returns undefined
*/
private function addCompletedInfo( &$output, $start, $end, $reloads ) {
global $wgLinkTitlesTimeLimit;
$pagesPerReload = sprintf('%0.1f', $end / $reloads);
@ -281,9 +296,11 @@ EOF
);
}
/// Counts the number of pages in a read-access wiki database ($dbr).
/// @param $dbr Read-only `Database` object.
/// @returns Number of pages in the default namespace (0) of the wiki.
/*
* Counts the number of pages in a read-access wiki database ($dbr).
* @param $dbr Read-only `Database` object.
* @returns Number of pages in the default namespace (0) of the wiki.
*/
private function countPages(&$dbr, $namespacesClause) {
$res = $dbr->select(
'page',

140
includes/Splitter.php Normal file
View File

@ -0,0 +1,140 @@
<?php
/**
* The Splitter class caches a regular expression that delimits text to be parsed.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
 * Caches a regular expression that delimits text to be parsed.
 */
class Splitter {
	/**
	 * The splitting expression that separates text to be parsed from text that
	 * must not be parsed.
	 * @var String $splitter
	 */
	public $splitter;

	/**
	 * The LinkTitles configuration for this Splitter instance.
	 * @var Config $config
	 */
	public $config;

	/**
	 * The cached singleton instance (null until first requested or after
	 * Splitter::invalidate() was called).
	 * @var Splitter|null $instance
	 */
	private static $instance;

	/**
	 * Gets the Splitter singleton; may build one with the given config or the
	 * default config if none is given.
	 *
	 * If the instance was already created, it does not matter what Config this
	 * method is called with. To re-create an instance with a different Config,
	 * call Splitter::invalidate() first.
	 *
	 * @param Config|null $config LinkTitles configuration.
	 * @return Splitter The singleton instance.
	 */
	public static function default( Config &$config = null ) {
		if ( self::$instance === null ) {
			if ( $config === null ) {
				$config = new Config();
			}
			self::$instance = new Splitter( $config );
		}
		return self::$instance;
	}

	/**
	 * Invalidates the singleton instance.
	 *
	 * Used for unit testing.
	 */
	public static function invalidate() {
		self::$instance = null;
	}

	/**
	 * The constructor is protected to enforce using Splitter::default().
	 * It stores the configuration and builds the splitting expression.
	 *
	 * @param Config $config LinkTitles configuration.
	 */
	protected function __construct( Config $config) {
		$this->config = $config;
		$this->buildSplitter();
	}

	/**
	 * Splits a text into sections that may be linked and sections that may not
	 * be linked (e.g., because they already are a link, or a template, etc.).
	 *
	 * @param String &$text Text to split.
	 * @return Array of strings where even indexes point to linkable sections.
	 */
	public function split( &$text ) {
		return preg_split( $this->splitter, $text, -1, PREG_SPLIT_DELIM_CAPTURE );
	}

	/**
	 * Builds the delimiter that is used in a regexp to separate
	 * text that should be parsed from text that should not be
	 * parsed (e.g. inside existing links etc.)
	 *
	 * The expression is stored in $this->splitter.
	 */
	private function buildSplitter() {
		if ( $this->config->skipTemplates ) {
			// Use recursive regex to balance curly braces;
			// see http://www.regular-expressions.info/recurse.html
			$templatesDelimiter = '{{(?>[^{}]|(?R))*}}|';
		} else {
			// Match template names (ignoring any piped [[]] links in them)
			// along with the trailing pipe and parameter name or closing
			// braces; also match sequences of '|wordcharacters=' (without
			// spaces in them) that usually only occur as parameter names in
			// transclusions (but could also occur as wiki table cell contents).
			// TODO: Find a way to match parameter names in transclusions, but
			// not in table cells or other sequences involving a pipe character
			// and equal sign.
			$templatesDelimiter = '{{[^|]*?(?:(?:\[\[[^]]+]])?)[^|]*?(?:\|(?:\w+=)?|(?:}}))|\|\w+=|';
		}

		// Build a regular expression that will capture existing wiki links ("[[...]]"),
		// wiki headings ("= ... =", "== ... ==" etc.),
		// urls ("http://example.com", "[http://example.com]", "[http://example.com Description]"),
		// and email addresses ("mail@example.com").
		// Since there is a user option to skip headings, we make this part of the expression
		// optional. Note that in order to use preg_split(), it is important to have only one
		// capturing subpattern (which precludes the use of conditional subpatterns).
		$delimiter = $this->config->parseHeadings ? '' : '=+.+?=+|';
		$urlPattern = '[a-z]+?\:\/\/(?:\S+\.)+\S+(?:\/.*)?';
		$this->splitter = '/(' .                      // exclude from linking:
			'\[\[.*?\]\]|' .                            // links
			$delimiter .                                // titles (if requested)
			$templatesDelimiter .                       // templates (if requested)
			'^ .+?\n|\n .+?\n|\n .+?$|^ .+?$|' .        // preformatted text
			// The closing nowiki tag is matched with an escaped slash, like all
			// other closing tags, so that only a real '</nowiki>' ends the section.
			'<nowiki>.*?<\/nowiki>|<code>.*?<\/code>|' . // nowiki/code
			'<pre>.*?<\/pre>|<html>.*?<\/html>|' .      // pre/html
			'<script>.*?<\/script>|' .                  // script
			'<gallery>.*?<\/gallery>|' .                // gallery
			'<div.+?>|<\/div>|' .                       // attributes of div elements
			'<span.+?>|<\/span>|' .                     // attributes of span elements
			'<file>[^<]*<\/file>|' .                    // stuff inside file elements
			'style=".+?"|class=".+?"|' .                // styles and classes (e.g. of wikitables)
			'<noautolinks>.*?<\/noautolinks>|' .        // custom tag 'noautolinks'
			'\[' . $urlPattern . '\s.+?\]|'. $urlPattern . '(?=\s|$)|' . // urls
			'(?<=\b)\S+\@(?:\S+\.)+\S+(?=\b)' .         // email addresses
			')/ismS';
	}
}

194
includes/Target.php Normal file
View File

@ -0,0 +1,194 @@
<?php
/**
* The LinkTitles\Target represents a Wiki page that is a potential link target.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
 * Represents a page that is a potential link target.
 */
class Target {
	/**
	 * A Title object for the target page currently being examined.
	 * @var \Title $title
	 */
	private $title;

	/**
	 * The value returned by \Title::getTitleValue() for the target title.
	 *
	 * Declared explicitly because the constructor assigns it; it was
	 * previously created as an undeclared (dynamic, hence public) property.
	 * @var mixed $titleValue
	 */
	public $titleValue;

	/**
	 * Caches the target page content as a \Content object.
	 *
	 * @var \Content $content
	 */
	private $content;

	/**
	 * Regex that matches the start of a word; this expression depends on the
	 * setting of LinkTitles\Config->wordStartOnly;
	 * @var String $wordStart
	 */
	public $wordStart;

	/**
	 * Regex that matches the end of a word; this expression depends on the
	 * setting of LinkTitles\Config->wordEndOnly;
	 * @var String $wordEnd
	 */
	public $wordEnd;

	/**
	 * LinkTitles configuration.
	 * @var Config $config
	 */
	private $config;

	/**
	 * Constructs a new Target object
	 *
	 * The parameters may be taken from database rows, for example.
	 *
	 * @param Int $nameSpace Name space of the target page
	 * @param String $title Title of the target page
	 * @param Config &$config LinkTitles configuration
	 */
	public function __construct( $nameSpace, $title, Config &$config ) {
		$this->title = \Title::makeTitleSafe( $nameSpace, $title );
		$this->titleValue = $this->title->getTitleValue();
		$this->config = $config;

		// Use unicode character properties rather than \b escape sequences
		// to detect whole words containing non-ASCII characters as well.
		// Note that this requires a PCRE library that was compiled with
		// --enable-unicode-properties
		$this->wordStart = $config->wordStartOnly ? '(?<!\pL)' : '';
		$this->wordEnd = $config->wordEndOnly ? '(?!\pL)' : '';
	}

	/**
	 * Gets the string representation of the target title.
	 * @return String title text
	 */
	public function getTitleText() {
		return $this->title->getText();
	}

	/**
	 * Gets the title string with certain characters escaped that may interfere
	 * with regular expressions.
	 * @return String representation of the title, regex-safe
	 */
	public function getRegexSafeTitle() {
		return preg_quote( $this->title->getText(), '/' );
	}

	/**
	 * Builds a regular expression of the title
	 * @return String regular expression for this title.
	 */
	public function getCaseSensitiveRegex() {
		$regexSafeTitle = $this->getRegexSafeTitle();

		// Depending on the $config->capitalLinks setting,
		// the title has to be searched for either in a strictly case-sensitive
		// way, or in a 'fuzzy' way where the first letter of the title may
		// be either case. The fuzzy variant is not used if the quoted title
		// starts with a backslash, because the first character cannot be
		// separated from its escape then.
		if ( $this->config->capitalLinks && ( $regexSafeTitle[0] != '\\' )) {
			$searchTerm = '((?i)' . $regexSafeTitle[0] . '(?-i)' . substr($regexSafeTitle, 1) . ')';
		} else {
			$searchTerm = '(' . $regexSafeTitle . ')';
		}
		return $this->buildRegex( $searchTerm );
	}

	/**
	 * Builds a regular expression pattern for the title in a case-insensitive
	 * way.
	 * @return String case-insensitive regular expression pattern for the title
	 */
	public function getCaseInsensitiveRegex() {
		return $this->buildRegex( $this->getRegexSafeTitle() ) . 'i';
	}

	/**
	 * Builds the basic regex that is used to match target page titles in a source
	 * text.
	 * @param String $searchTerm Target page title (special characters must be quoted)
	 * @return String regular expression pattern
	 */
	private function buildRegex( $searchTerm ) {
		return '/(?<![\:\.\@\/\?\&])' . $this->wordStart . $searchTerm . $this->wordEnd . '/S';
	}

	/**
	 * Returns the \Content of the target page.
	 *
	 * The value is cached.
	 * @return \Content|null Content of the Target page, as returned by
	 * \WikiPage::getContent().
	 */
	public function getContent() {
		if ( $this->content === null ) {
			$this->content = \WikiPage::factory( $this->title )->getContent();
		}
		return $this->content;
	}

	/**
	 * Examines the current target page. Returns true if it may be linked;
	 * false if not. This depends on two settings:
	 * $wgLinkTitlesCheckRedirect and $wgLinkTitlesEnableNoTargetMagicWord
	 * and whether the target page is a redirect or contains the
	 * __NOAUTOLINKTARGET__ magic word.
	 *
	 * @param \Title $fromTitle Title of the page that would contain the link.
	 *
	 * @return boolean
	 */
	public function mayLinkTo( \Title $fromTitle ) {
		$checkRedirect = $this->config->checkRedirect;
		$checkMagicWord = $this->config->enableNoTargetMagicWord;
		if ( !$checkRedirect && !$checkMagicWord ) {
			// Nothing to examine; do not load the page content needlessly.
			return true;
		}

		$content = $this->getContent();
		if ( $content === null ) {
			// Without content there is neither a redirect nor a magic word
			// that could prohibit linking (and nothing to call methods on).
			return true;
		}

		// If checking for redirects is enabled and the target page does
		// indeed redirect to the current page, return the page title as-is
		// (unlinked).
		if ( $checkRedirect ) {
			$redirectTitle = $content->getUltimateRedirectTarget();
			if ( $redirectTitle && $redirectTitle->equals( $fromTitle ) ) {
				return false;
			}
		}

		// If the magic word __NOAUTOLINKTARGET__ is enabled and the target
		// page does indeed contain this magic word, return the page title
		// as-is (unlinked).
		if ( $checkMagicWord ) {
			if ( $content->matchMagicWord( \MagicWord::get('MAG_LINKTITLES_NOTARGET') ) ) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Determines if the Target's title is the same as another title.
	 * @param \Title $otherTitle Other title
	 * @return boolean True if the $otherTitle is the same, false if not.
	 */
	public function isSameTitle( \Title $otherTitle) {
		return $this->title->equals( $otherTitle );
	}
}

142
includes/Targets.php Normal file
View File

@ -0,0 +1,142 @@
<?php
/**
* The LinkTitles\Targets class.
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> ('bovender')
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
* MA 02110-1301, USA.
*
* @author Daniel Kraus <bovender@bovender.de>
*/
namespace LinkTitles;
/**
 * Fetches potential target page titles from the database.
 */
class Targets {
	/**
	 * The cached singleton instance (null until first requested or after
	 * Targets::invalidate() was called).
	 * @var Targets|null $instance
	 */
	private static $instance;

	/**
	 * Singleton factory that returns a (cached) database query results with
	 * potential target page titles.
	 *
	 * The subset of pages that may serve as target pages depends on the
	 * name space of the source page. Therefore, if the name space of $title
	 * differs from the cached name space, the database is queried again.
	 *
	 * @param \Title $title Title of the current (source) page.
	 * @param Config $config LinkTitles configuration.
	 * @return Targets The (possibly cached) Targets instance.
	 */
	public static function default( \Title $title, Config $config ) {
		if ( ( self::$instance === null ) || ( self::$instance->nameSpace != $title->getNamespace() ) ) {
			self::$instance = new Targets( $title, $config );
		}
		return self::$instance;
	}

	/**
	 * Invalidates the cache; the next call of Targets::default() will trigger
	 * a database query.
	 *
	 * Use this in unit tests which are performed in a single request cycle so that
	 * changes to the pages list may not be picked up by the cached Targets instance.
	 */
	public static function invalidate() {
		self::$instance = null;
	}

	/**
	 * Holds the results of a database query for target page titles, filtered
	 * and sorted.
	 * @var IResultWrapper $queryResult
	 */
	public $queryResult;

	/**
	 * Holds the name space (integer) for which the list of target pages was built.
	 * @var Int $nameSpace
	 */
	public $nameSpace;

	/**
	 * LinkTitles configuration.
	 * @var Config $config
	 */
	private $config;

	/**
	 * The constructor is private to enforce using the singleton pattern.
	 * @param \Title $title Title of the current (source) page.
	 * @param Config $config LinkTitles configuration.
	 */
	private function __construct( \Title $title, Config $config) {
		$this->config = $config;
		$this->nameSpace = $title->getNameSpace();
		$this->fetch();
	}

	/**
	 * Fetches the page titles from the database.
	 *
	 * Titles are taken from the configured name spaces (plus the name space
	 * of the source page, which is given the highest priority), must have the
	 * configured minimum length and must not be black-listed. The result is
	 * stored in $this->queryResult.
	 */
	private function fetch() {
		$sortOrder = $this->config->preferShortTitles ? 'ASC' : 'DESC';

		// NOTE(review): DB_SLAVE was superseded by DB_REPLICA in newer
		// MediaWiki versions; kept as-is for compatibility -- confirm the
		// minimum supported MediaWiki version before changing it.
		$dbr = wfGetDB( DB_SLAVE );

		// Build a blacklist of pages that are not supposed to be link
		// targets. The titles are quoted by the database layer rather than
		// by hand so that the list is valid (and safe) on every DB backend.
		$quotedTitles = array();
		foreach ( $this->config->blackList as $blackListedTitle ) {
			$quotedTitles[] = $dbr->addQuotes( str_replace( ' ', '_', $blackListedTitle ) );
		}
		if ( !$quotedTitles ) {
			// 'NOT IN ()' is not valid SQL; exclude the empty title instead.
			$quotedTitles[] = $dbr->addQuotes( '' );
		}
		$blackList = '(' . implode( ',', $quotedTitles ) . ')';

		// Build our weight list. Make sure current namespace is first element
		$nameSpaces = array_diff( $this->config->nameSpaces, [ $this->nameSpace ] );
		array_unshift( $nameSpaces, $this->nameSpace );

		// No need for sanity check. We are sure that we have at least one
		// element in the array.
		$weightSelect = "CASE page_namespace ";
		$currentWeight = 0;
		foreach ( $nameSpaces as $nameSpaceValue ) {
			$currentWeight = $currentWeight + 100;
			$weightSelect = $weightSelect . " WHEN " . $nameSpaceValue . " THEN " . $currentWeight . PHP_EOL;
		}
		$weightSelect = $weightSelect . " END ";
		$nameSpacesClause = '(' . implode( ', ', $nameSpaces ) . ')';
		$minimumLength = (int)$this->config->minimumTitleLength;

		// Build an SQL query and fetch all page titles, ordered by name space
		// weight and then by title length. Since the db may be sqlite, we need
		// a try..catch structure because sqlite does not support the
		// CHAR_LENGTH function. The exception class must be fully qualified:
		// an unqualified 'Exception' would refer to a class in the LinkTitles
		// name space and never catch anything.
		try {
			$this->queryResult = $dbr->select(
				'page',
				array( 'page_title', 'page_namespace' , "weight" => $weightSelect),
				array(
					'page_namespace IN ' . $nameSpacesClause,
					'CHAR_LENGTH(page_title) >= ' . $minimumLength,
					'page_title NOT IN ' . $blackList,
				),
				__METHOD__,
				array( 'ORDER BY' => 'weight ASC, CHAR_LENGTH(page_title) ' . $sortOrder )
			);
		} catch ( \Exception $e ) {
			$this->queryResult = $dbr->select(
				'page',
				array( 'page_title', 'page_namespace' , "weight" => $weightSelect ),
				array(
					'page_namespace IN ' . $nameSpacesClause,
					'LENGTH(page_title) >= ' . $minimumLength,
					'page_title NOT IN ' . $blackList,
				),
				__METHOD__,
				array( 'ORDER BY' => 'weight ASC, LENGTH(page_title) ' . $sortOrder )
			);
		}
	}
}

View File

@ -1,5 +1,7 @@
<?php
/*
/**
* LinkTitles command line interface (CLI)/maintenance script
*
* Copyright 2012-2017 Daniel Kraus <bovender@bovender.de> @bovender
*
* This program is free software; you can redistribute it and/or modify
@ -44,17 +46,21 @@ else
}
};
require_once( __DIR__ . "/includes/LinkTitles_Extension.php" );
require_once( __DIR__ . "/includes/Extension.php" );
/// Core class of the maintenance script.
/// @note Note that the execution of maintenance scripts is prohibited for
/// an Apache web server due to a `.htaccess` file that declares `deny from
/// all`. Other webservers may exhibit different behavior. Be aware that
/// anybody who is able to execute this script may place a high load on the
/// server.
/// @ingroup batch
/**
* Core class of the maintenance script.
* @note Note that the execution of maintenance scripts is prohibited for
* an Apache web server due to a `.htaccess` file that declares `deny from
* all`. Other webservers may exhibit different behavior. Be aware that
* anybody who is able to execute this script may place a high load on the
* server.
* @ingroup batch
*/
class Cli extends \Maintenance {
/// The constructor adds a description and one option.
/**
* Constructor.
*/
public function __construct() {
parent::__construct();
$this->addDescription("Iterates over wiki pages and automatically adds links to other pages.");
@ -72,34 +78,38 @@ class Cli extends \Maintenance {
true, // requires argument
"p"
);
$this->addOption(
"log",
"enables logging to console",
false, // not required
false, // requires no argument
"l"
);
$this->addOption(
"debug",
"enables debug logging to console",
false, // not required
false // requires no argument
);
// TODO: Add back logging options.
// TODO: Add configuration options.
// $this->addOption(
// "log",
// "enables logging to console",
// false, // not required
// false, // requires no argument
// "l"
// );
// $this->addOption(
// "debug",
// "enables debug logging to console",
// false, // not required
// false // requires no argument
// );
}
/// Main function of the maintenance script.
/// Will iterate over all pages in the wiki (starting at a certain index,
/// if the `--start` option is given) and call LinkTitles::processPage() for
/// each page.
/**
* Main function of the maintenance script.
* Will iterate over all pages in the wiki (starting at a certain index,
* if the `--start` option is given) and call LinkTitles::processPage() for
* each page.
*/
public function execute() {
if ($this->hasOption('log'))
{
Extension::$ltConsoleOutput = true;
}
if ($this->hasOption('debug'))
{
Extension::$ltConsoleOutputDebug = true;
}
// if ($this->hasOption('log'))
// {
// Extension::$ltConsoleOutput = true;
// }
// if ($this->hasOption('debug'))
// {
// Extension::$ltConsoleOutputDebug = true;
// }
if ( $this->hasOption('page') ) {
if ( !$this->hasOption( 'start' ) ) {
$this->singlePage();
@ -113,10 +123,14 @@ class Cli extends \Maintenance {
if ( $startIndex < 0 ) {
$this->error( 'FATAL: Start index must be 0 or greater.', 1 );
};
$this->allPages( $startIndex);
$this->allPages( $startIndex );
}
}
/**
* Processes a single page.
* @return bool True on success, false on failure.
*/
private function singlePage() {
$pageName = strval( $this->getOption( 'page' ) );
$this->output( "Processing single page: '$pageName'\n" );
@ -131,17 +145,22 @@ class Cli extends \Maintenance {
return $success;
}
/**
* Process all pages in the Wiki.
* @param integer $index Index of the start page.
* @return bool True on success, false on failure.
*/
private function allPages( $index = 0 ) {
global $wgLinkTitlesNamespaces;
$config = new Config();
// Retrieve page names from the database.
$dbr = $this->getDB( DB_SLAVE );
$namespacesClause = str_replace( '_', ' ','(' . implode( ', ', $wgLinkTitlesNamespaces ) . ')' );
$nameSpacesClause = str_replace( '_', ' ','(' . implode( ', ', $config->nameSpaces ) . ')' );
$res = $dbr->select(
'page',
array( 'page_title', 'page_namespace' ),
array(
'page_namespace IN ' . $namespacesClause,
'page_namespace IN ' . $nameSpacesClause,
),
__METHOD__,
array(

View File

@ -0,0 +1,20 @@
<?php
/**
 * Tests the LinkTitles\Config class.
 *
 * This single unit test basically serves to ensure the Config class is working.
 * @group bovender
 * @group Database
 */
class ConfigTest extends LinkTitles\TestCase {
	/**
	 * Checks that the parseOnEdit property of a freshly constructed Config
	 * is identical to the $wgLinkTitlesParseOnEdit global set for this test.
	 */
	public function testParseOnEdit() {
		$this->setMwGlobals( [
			'wgLinkTitlesParseOnEdit' => true,
			'wgLinkTitlesParseOnRender' => false
		] );
		$config = new LinkTitles\Config();
		global $wgLinkTitlesParseOnEdit;
		// PHPUnit's signature is ( $expected, $actual ): the global is the
		// reference value, the Config property is the value under test.
		$this->assertSame( $wgLinkTitlesParseOnEdit, $config->parseOnEdit );
	}
}

View File

@ -0,0 +1,27 @@
<?php
/**
 * Tests how the text of a newly inserted page is affected by the
 * $wgLinkTitlesParseOnEdit setting.
 *
 * @group bovender
 * @group Database
 */
class ParseOnEditTest extends LinkTitles\TestCase {
	/**
	 * With parse-on-edit enabled, the stored text of the inserted page is
	 * expected to contain a link to 'link target', while the page's own
	 * title ('test page') is expected to stay unlinked.
	 */
	public function testParseOnEdit() {
		$this->setMwGlobals( [
			'wgLinkTitlesParseOnEdit' => true,
			'wgLinkTitlesParseOnRender' => false
		] );
		$pageId = $this->insertPage( 'test page', 'This page should link to the link target but not to test page' )['id'];
		$page = WikiPage::newFromId( $pageId );
		// getPageText() is an instance method, so call it on $this rather
		// than with static-call syntax.
		$this->assertSame( 'This page should link to the [[link target]] but not to test page', $this->getPageText( $page ) );
	}

	/**
	 * With parse-on-edit (and parse-on-render) disabled, the stored text of
	 * the inserted page is expected to be exactly what was inserted.
	 */
	public function testDoNotParseOnEdit() {
		$this->setMwGlobals( [
			'wgLinkTitlesParseOnEdit' => false,
			'wgLinkTitlesParseOnRender' => false
		] );
		$pageId = $this->insertPage( 'test page', 'This page should not link to the link target' )['id'];
		$page = WikiPage::newFromId( $pageId );
		$this->assertSame( 'This page should not link to the link target', $this->getPageText( $page ) );
	}
}

View File

@ -0,0 +1,31 @@
<?php
/**
 * Unit tests for LinkTitles\Splitter.
 *
 * @group bovender
 */
class SplitterTest extends MediaWikiTestCase {
	/**
	 * Splits the input with the default splitter and compares the result
	 * with the expected list of fragments.
	 *
	 * @dataProvider provideSplitData
	 * @param string $input Text handed to the splitter.
	 * @param array $expectedOutput Fragments the splitter is expected to return.
	 */
	public function testSplit( $input, $expectedOutput ) {
		$actualOutput = LinkTitles\Splitter::default()->split( $input );
		$this->assertSame( $expectedOutput, $actualOutput );
	}

	// TODO: Add more examples.
	/**
	 * Supplies pairs of input text and expected split result for testSplit().
	 *
	 * @return array List of [ input, expected fragments ] data sets.
	 */
	public static function provideSplitData() {
		// Text followed by an existing wiki link.
		$existingLink = [
			'this may be linked [[this may not be linked]]',
			[ 'this may be linked ', '[[this may not be linked]]', '' ]
		];
		// Text followed by a <gallery> element.
		$galleryElement = [
			'this may be linked <gallery>this may not be linked</gallery>',
			[ 'this may be linked ', '<gallery>this may not be linked</gallery>', '' ]
		];
		// Text followed by a template call that contains a nested template.
		$nestedTemplate = [
			'this may be linked {{mytemplate|param={{transcluded}}}}',
			[ 'this may be linked ', '{{mytemplate|param={{transcluded}}}}', '' ]
		];

		return [ $existingLink, $galleryElement, $nestedTemplate ];
	}
}

View File

@ -0,0 +1,40 @@
<?php
/**
 * Unit tests for the word delimiters exposed by LinkTitles\Target.
 *
 * @group bovender
 */
class TargetTest extends MediaWikiTestCase {
	/**
	 * Checks the wordStart property of a Target against the wordStartOnly
	 * configuration switch.
	 *
	 * @dataProvider provideStartOnly
	 * @param bool $enabled Value assigned to Config::$wordStartOnly.
	 * @param string $delimiter Expected value of Target::$wordStart.
	 */
	public function testTargetWordStartOnly( $enabled, $delimiter ) {
		$config = new LinkTitles\Config();
		$config->wordStartOnly = $enabled;
		// Spell the namespace exactly as declared ('LinkTitles'): autoloaders
		// that look class names up case-sensitively cannot resolve a
		// differently-cased name.
		$target = new LinkTitles\Target( NS_MAIN, 'test page', $config );
		$this->assertSame( $delimiter, $target->wordStart );
	}

	/**
	 * Data sets for testTargetWordStartOnly().
	 *
	 * @return array List of [ wordStartOnly setting, expected delimiter ].
	 */
	public static function provideStartOnly() {
		return [
			[ true, '(?<!\pL)' ],
			[ false, '' ]
		];
	}

	/**
	 * Checks the wordEnd property of a Target against the wordEndOnly
	 * configuration switch.
	 *
	 * @dataProvider provideEndOnly
	 * @param bool $enabled Value assigned to Config::$wordEndOnly.
	 * @param string $delimiter Expected value of Target::$wordEnd.
	 */
	public function testTargetWordEndOnly( $enabled, $delimiter ) {
		$config = new LinkTitles\Config();
		$config->wordEndOnly = $enabled;
		$target = new LinkTitles\Target( NS_MAIN, 'test page', $config );
		$this->assertSame( $delimiter, $target->wordEnd );
	}

	/**
	 * Data sets for testTargetWordEndOnly().
	 *
	 * @return array List of [ wordEndOnly setting, expected delimiter ].
	 */
	public static function provideEndOnly() {
		return [
			[ true, '(?!\pL)' ],
			[ false, '' ]
		];
	}
}

View File

@ -0,0 +1,26 @@
<?php
/**
 * Tests the LinkTitles\Targets class.
 *
 * @group bovender
 * @group Database
 */
class TargetsTest extends LinkTitles\TestCase {
	/**
	 * Asserts that the number of rows in the query result of the default
	 * Targets instance equals the page count reported by SiteStatsInit.
	 */
	public function testTargets() {
		$title = \Title::newFromText( 'link target' );
		$targets = LinkTitles\Targets::default( $title, new LinkTitles\Config() );

		// Count number of articles: Inspired by updateArticleCount.php maintenance
		// script: https://doc.wikimedia.org/mediawiki-core/master/php/updateArticleCount_8php_source.html
		$dbr = wfGetDB( DB_SLAVE );
		$counter = new SiteStatsInit( $dbr );
		$count = $counter->pages();

		// PHPUnit's signature is ( $expected, $actual ): the independently
		// determined page count is the reference, the targets query is the
		// value under test.
		$this->assertEquals( $count, $targets->queryResult->numRows() );
	}
}

View File

@ -0,0 +1,19 @@
<?php
namespace LinkTitles;
/**
 * Common base class for LinkTitles tests that need a page to link to.
 */
abstract class TestCase extends \MediaWikiTestCase {
	/**
	 * Inserts the 'link target' page before each test and invalidates the
	 * Targets cache afterwards.
	 */
	protected function setUp() {
		parent::setUp();
		$this->insertPage( 'link target', 'This page serves as a link target' );
		// force re-querying the pages table
		Targets::invalidate();
	}

	/**
	 * Delegates to the parent implementation; no additional cleanup is done here.
	 */
	protected function tearDown() {
		parent::tearDown();
	}

	/**
	 * Returns the content of a page, serialized by the page's content handler.
	 *
	 * @param \WikiPage $page Page whose content is serialized.
	 * @return mixed Whatever the content handler's serializeContent() returns.
	 */
	protected function getPageText( \WikiPage $page ) {
		$pageContent = $page->getContent();
		$handler = $page->getContentHandler();
		return $handler->serializeContent( $pageContent );
	}
}