From 4d5554a74ec5e44e6265dffcb13cebb471f63e6b Mon Sep 17 00:00:00 2001 From: Daniel Kraus Date: Sat, 26 Aug 2017 22:01:52 +0200 Subject: [PATCH] Add Delimiters class. --- extension.json | 2 +- includes/Config.php | 45 ++++++++++ includes/Delimiters.php | 138 +++++++++++++++++++++++++++++++ includes/Extension.php | 98 +++------------------- tests/phpunit/DelimitersTest.php | 42 ++++++++++ 5 files changed, 238 insertions(+), 87 deletions(-) create mode 100644 includes/Delimiters.php create mode 100644 tests/phpunit/DelimitersTest.php diff --git a/extension.json b/extension.json index e976596..9fa0cda 100644 --- a/extension.json +++ b/extension.json @@ -35,6 +35,7 @@ "AutoloadClasses": { "LinkTitles\\Extension": "includes/Extension.php", "LinkTitles\\Targets": "includes/Targets.php", + "LinkTitles\\Delimiters": "includes/Delimiters.php", "LinkTitles\\Config": "includes/Config.php", "LinkTitles\\Special": "includes/Special.php", "LinkTitles\\TestCase": "tests/phpunit/TestCase.php" @@ -64,7 +65,6 @@ "LinkTitles\\Extension::onParserFirstCallInit" ] }, - "callback": "LinkTitles\\Extension::setup", "ExtensionMessagesFiles": { "LinkTitlesMagic": "includes/Magic.php" }, diff --git a/includes/Config.php b/includes/Config.php index 3c4702e..5264323 100644 --- a/includes/Config.php +++ b/includes/Config.php @@ -96,6 +96,43 @@ class Config { */ public $capitalLinks; + /** + * Whether or not to link to pages only if the page title appears at the + * start of a word on the target page (i.e., link 'MediaWiki' to a page + * 'Media', but not to a page 'Wiki'). + * + * Set both $wordStartOnly and $wordEndOnly to true to enforce matching + * whole titles. + * + * @var bool $wordStartOnly; + */ + public $wordStartOnly; + + /** + * Whether or not to link to pages only if the page title appears at the + * end of a word on the target page (i.e., link 'MediaWiki' to a page + * 'Wiki', but not to a page 'Media'). 
+ * + * Set both $wordStartOnly and $wordEndOnly to true to enforce matching + * whole titles. + * + * @var bool $wordEndOnly; + */ + public $wordEndOnly; + + /** + * Whether or not to skip templates. If set to true, text inside transclusions + * will not be linked. + * @var bool $skipTemplates + */ + public $skipTemplates; + + /** + * Whether or not to parse headings. + * @var bool $parseHeadings + */ + public $parseHeadings; + /** * Constructs a new Config object. * @@ -112,6 +149,10 @@ class Config { global $wgLinkTitlesFirstOnly; global $wgLinkTitlesSmartMode; global $wgCapitalLinks; + global $wgLinkTitlesWordStartOnly; + global $wgLinkTitlesWordEndOnly; + global $wgLinkTitlesSkipTemplates; + global $wgLinkTitlesParseHeadings; $this->parseOnEdit = $wgLinkTitlesParseOnEdit; $this->parseOnRender = $wgLinkTitlesParseOnRender; $this->preferShortTitles = $wgLinkTitlesPreferShortTitles; @@ -121,6 +162,10 @@ class Config { $this->firstOnly = $wgLinkTitlesFirstOnly; $this->smartMode = $wgLinkTitlesSmartMode; $this->capitalLinks = $wgCapitalLinks; // MediaWiki global variable + $this->wordStartOnly = $wgLinkTitlesWordStartOnly; + $this->wordEndOnly = $wgLinkTitlesWordEndOnly; + $this->skipTemplates = $wgLinkTitlesSkipTemplates; + $this->parseHeadings = $wgLinkTitlesParseHeadings; } } diff --git a/includes/Delimiters.php b/includes/Delimiters.php new file mode 100644 index 0000000..d6b513d --- /dev/null +++ b/includes/Delimiters.php @@ -0,0 +1,138 @@ + ('bovender') + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + * + * @author Daniel Kraus + */ +namespace LinkTitles; + +/** + * Caches a regular expression that delimits text to be parsed. + */ +class Delimiters { + private static $instance; + + /** + * Singleton factory. + * + * @param Config $config LinkTitles configuration. + */ + public static function default( Config $config ) { + if ( self::$instance === null ) { + self::$instance = new Delimiters( $config ); + } + return self::$instance; + } + + /** + * Invalidates the singleton instance. + * + * Used for unit testing. + */ + public static function invalidate() { + self::$instance = null; + } + + protected function __construct( Config $config) { + $this->config = $config; + $this->buildDelimiters(); + } + + /** + * The splitting expression that separates text to be parsed from text that + * must not be parsed. + * @var String $splitter + */ + public $splitter; + + /** + * Regex that matches the start of a word; this expression depends on the + * setting of LinkTitles\Config->wordStartOnly; + * @var String $wordStart + */ + public $wordStart; + + /** + * Regex that matches the end of a word; this expression depends on the + * setting of LinkTitles\Config->wordEndOnly; + * @var String $wordEnd + */ + public $wordEnd; + + private $config; + + /* + * Builds the delimiter that is used in a regexp to separate + * text that should be parsed from text that should not be + * parsed (e.g. inside existing links etc.) + */ + private function buildDelimiters() { + // Use unicode character properties rather than \b escape sequences + // to detect whole words containing non-ASCII characters as well. + // Note that this requires a PCRE library that was compiled with + // --enable-unicode-properties + ( $this->config->wordStartOnly ) ? 
$this->wordStart = '(?<!\pL)' : $this->wordStart = ''; + ( $this->config->wordEndOnly ) ? $this->wordEnd = '(?!\pL)' : $this->wordEnd = ''; + + if ( $this->config->skipTemplates ) + { + // Use recursive regex to balance curly braces; + // see http://www.regular-expressions.info/recurse.html + $templatesDelimiter = '{{(?>[^{}]|(?R))*}}|'; + } else { + // Match template names (ignoring any piped [[]] links in them) + // along with the trailing pipe and parameter name or closing + // braces; also match sequences of '|wordcharacters=' (without + // spaces in them) that usually only occur as parameter names in + // transclusions (but could also occur as wiki table cell contents). + // TODO: Find a way to match parameter names in transclusions, but + // not in table cells or other sequences involving a pipe character + // and equal sign. + $templatesDelimiter = '{{[^|]*?(?:(?:\[\[[^]]+]])?)[^|]*?(?:\|(?:\w+=)?|(?:}}))|\|\w+=|'; + } + + // Build a regular expression that will capture existing wiki links ("[[...]]"), + // wiki headings ("= ... =", "== ... ==" etc.), + // urls ("http://example.com", "[http://example.com]", "[http://example.com Description]", + // and email addresses ("mail@example.com"). + // Since there is a user option to skip headings, we make this part of the expression + // optional. Note that in order to use preg_split(), it is important to have only one + // capturing subpattern (which precludes the use of conditional subpatterns). + ( $this->config->parseHeadings ) ? $delimiter = '' : $delimiter = '=+.+?=+|'; + $urlPattern = '[a-z]+?\:\/\/(?:\S+\.)+\S+(?:\/.*)?'; + $this->splitter = '/(' . // exclude from linking: + '\[\[.*?\]\]|' . // links + $delimiter . // titles (if requested) + $templatesDelimiter . // templates (if requested) + '^ .+?\n|\n .+?\n|\n .+?$|^ .+?$|' . // preformatted text + '<nowiki>.*?<.nowiki>|<code>.*?<\/code>|' . // nowiki/code + '<pre>.*?<\/pre>|<html>.*?<\/html>|' .      // pre/html
+			'