Fix word detection with non-ASCII characters.

Use the Unicode character property \pL together with look-behind and
look-ahead assertions rather than '\b' escape sequences (which do not treat
non-ASCII letters as word characters).
This commit is contained in:
Daniel Kraus
2014-08-28 15:57:49 +02:00
parent 10f528e265
commit c800862990

View File

@ -104,8 +104,13 @@
global $wgLinkTitlesSmartMode; global $wgLinkTitlesSmartMode;
global $wgCapitalLinks; global $wgCapitalLinks;
( $wgLinkTitlesWordStartOnly ) ? $wordStartDelim = '\b' : $wordStartDelim = ''; // Use unicode character properties rather than \b escape sequences
( $wgLinkTitlesWordEndOnly ) ? $wordEndDelim = '\b' : $wordEndDelim = ''; // to detect whole words containing non-ASCII characters as well.
// Note that this requires the '/u' pattern modifier, and PHP must be
// built against a PCRE library that was compiled with
// --enable-unicode-properties.
( $wgLinkTitlesWordStartOnly ) ? $wordStartDelim = '(?<!\pL)' : $wordStartDelim = '';
( $wgLinkTitlesWordEndOnly ) ? $wordEndDelim = '(?!\pL)' : $wordEndDelim = '';
( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC'; ( $wgLinkTitlesPreferShortTitles ) ? $sort_order = 'ASC' : $sort_order = 'DESC';
( $wgLinkTitlesFirstOnly ) ? $limit = 1 : $limit = -1; ( $wgLinkTitlesFirstOnly ) ? $limit = 1 : $limit = -1;
@ -209,7 +214,7 @@
for ( $i = 0; $i < count( $arr ); $i+=2 ) { for ( $i = 0; $i < count( $arr ); $i+=2 ) {
// even indexes will point to text that is not enclosed by brackets // even indexes will point to text that is not enclosed by brackets
$arr[$i] = preg_replace_callback( '/(?<![\:\.\@\/\?\&])' . $arr[$i] = preg_replace_callback( '/(?<![\:\.\@\/\?\&])' .
$wordStartDelim . $searchTerm . $wordEndDelim . '/', $wordStartDelim . $searchTerm . $wordEndDelim . '/u',
array('LinkTitles', 'simpleModeCallback'), $arr[$i], $limit, $count ); array('LinkTitles', 'simpleModeCallback'), $arr[$i], $limit, $count );
if (( $limit >= 0 ) && ( $count > 0 )) { if (( $limit >= 0 ) && ( $count > 0 )) {
break; break;
@ -227,7 +232,7 @@
// even indexes will point to text that is not enclosed by brackets // even indexes will point to text that is not enclosed by brackets
$arr[$i] = preg_replace_callback( '/(?<![\:\.\@\/\?\&])' . $arr[$i] = preg_replace_callback( '/(?<![\:\.\@\/\?\&])' .
$wordStartDelim . '(' . $quotedTitle . ')' . $wordStartDelim . '(' . $quotedTitle . ')' .
$wordEndDelim . '/i', array('LinkTitles', 'smartModeCallback'), $wordEndDelim . '/iu', array('LinkTitles', 'smartModeCallback'),
$arr[$i], $limit, $count ); $arr[$i], $limit, $count );
if (( $limit >= 0 ) && ( $count > 0 )) { if (( $limit >= 0 ) && ( $count > 0 )) {
break; break;