Improved captcha word generation; fixed duplicate auth parameter in Special:Login privileged login; improved search indexer performance on websites with lots of words
--- a/includes/functions.php Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/functions.php Mon Jul 26 20:10:01 2010 -0400
@@ -2545,14 +2545,14 @@
}
/**
- * Paginates (breaks into multiple pages) a MySQL result resource, which is treated as unbuffered.
- * @param resource The MySQL result resource. This should preferably be an unbuffered query.
+ * Paginates (breaks into multiple pages) a database result resource, which is treated as unbuffered.
+ * @param resource The result resource. This should preferably be an unbuffered query, which allows scalability across very large result sets.
* @param string A template, with variables being named after the column name
* @param int The number of total results. This should be determined by a second query.
* @param string sprintf-style formatting string for URLs for result pages. First parameter will be start offset.
* @param int Optional. Start offset in individual results. Defaults to 0.
* @param int Optional. The number of results per page. Defualts to 10.
- * @param int Optional. An associative array of functions to call, with key names being column names, and values being function names. Values can also be an array with key 0 being either an object or a string(class name) and key 1 being a [static] method.
+ * @param array Optional. An associative array of functions to call, with key names being column names, and values being callbacks (string or array(string, string) or array(object, string)). They can also be closures if you're OK with incompatibility with PHP <5.3.0.
* @param string Optional. The text to be sent before the result list, only if there are any results. Possibly the start of a table.
* @param string Optional. The text to be sent after the result list, only if there are any results. Possibly the end of a table.
* @return string
--- a/includes/paths.php Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/paths.php Mon Jul 26 20:10:01 2010 -0400
@@ -726,6 +726,32 @@
}
/**
+ * Get the unique words on a page. Returns an array listing all items in small array $arr1 that are not in very large array $arr2.
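+ * @param array Words found on the page being indexed (the small list)
+ * @param array Master list of words that are already indexed (the very large list)
+ * @return array Items of the first array that do not occur in the second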
+ */
+
+ function get_unique_words($arr1, $arr2)
+ {
+   // Lookup table keyed by word, so each membership test is a hash lookup, not an array_search() scan.
+   // Assumes every item is a string (valid array key), as a word list should be -- TODO confirm.
+   $known = array();
+   foreach ( $arr1 as $w )
+     $known[$w] = false;
+   // Single pass over the very large array: flag each candidate word that already exists in it.
+   foreach ( $arr2 as $w )
+     if ( isset($known[$w]) )
+       $known[$w] = true;
+   // Preserve $arr1 order. Every occurrence of a flagged word is dropped, not only the first one.
+   $ret = array();
+   foreach ( $arr1 as $w )
+     if ( !$known[$w] )
+       $ret[] = $w;
+   return $ret;
+ }
+
+ /**
* Builds a word list for search indexing.
* @param string Text to index
* @param string Page ID of the page being indexed
@@ -863,17 +889,22 @@
$page_uniqid = $db->escape($page_uniqid);
// List of words on the page
+ if ( $debug )
+ echo "wordlist...";
$wordlist = $this->calculate_word_list($row['page_text'], $row['page_id'], $row['name']);
// Index calculation complete -- run inserts
$inserts = array();
+ $qt = array();
+ $unique_words = $this->get_unique_words($wordlist, $master_word_list);
foreach ( $wordlist as $word )
{
+ $qs = microtime_float();
if ( in_array($word, $stopwords) || strval(intval($word)) === $word || strlen($word) < 3 )
continue;
$word_db = $db->escape($word);
$word_db_lc = $db->escape(strtolower($word));
- if ( !in_array($word, $master_word_list) )
+ if ( in_array($word, $unique_words) )
{
$inserts[] = "( '$word_db', '$word_db_lc', '$page_uniqid' )";
}
@@ -888,7 +919,10 @@
if ( !$q )
$db->_die();
}
+ $qt[] = microtime_float() - $qs;
}
+ if ( $debug && count($qt) > 0 )
+ echo "QT: " . number_format(array_sum($qt) / count($qt), 4) . " * " . count($qt) . '; wl_len: ' . count($master_word_list) .' ';
if ( count($inserts) > 0 )
{
if ( $verbose && $debug )
@@ -899,14 +933,14 @@
$db->_die();
}
- $master_word_list = array_unique(array_merge($master_word_list, $wordlist));
+ $master_word_list = array_merge($master_word_list, $unique_words);
if ( $verbose )
{
if ( isset($_SERVER['REQUEST_URI']) )
echo '<br />';
echo "\n";
}
- unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row);
+ unset($inserts, $wordlist, $page_uniqid, $word_db, $q, $word, $row, $unique_words);
}
while ( $row = $db->fetchrow($texts) );
}
--- a/includes/sessions.php Sun Jul 25 11:23:09 2010 -0400
+++ b/includes/sessions.php Mon Jul 26 20:10:01 2010 -0400
@@ -3481,6 +3481,12 @@
$word .= 't';
else if ( $prev_l == 'p' && mt_rand(0, 5) == 1 )
$word .= 'h';
+ // this rule allows "ck" which can result in the occasional "dick", "fuck", etc. that tends
+ // to end up on 4chan, but I decided to keep it, because it increases word complexity.
+ else if ( $prev_l == 'c' && mt_rand(0, 3) == 1 )
+ $word .= 'k';
+ else if ( $prev_l == 'q' && mt_rand(0, 5) != 1 )
+ $word .= 'u';
else
$word .= $vowels{mt_rand(0, (strlen($vowels)-1))};
}
--- a/plugins/SpecialUserFuncs.php Sun Jul 25 11:23:09 2010 -0400
+++ b/plugins/SpecialUserFuncs.php Mon Jul 26 20:10:01 2010 -0400
@@ -391,7 +391,8 @@
$get_add = '';
foreach ( $get_fwd as $key => $value )
{
- $get_add .= "&{$key}=" . urlencode($value);
+ if ( $key != 'auth' )
+ $get_add .= "&{$key}=" . urlencode($value);
}
$get_add = ltrim($get_add, '&');
}