AJAX core library: possible breaking change. Readystatechange functions are now called with the XHR instance as the first parameter, to allow requests to run in parallel. This means much better stability, but it may break some applets (a compatibility hack is included).
<?php
/*
* Enano - an open-source CMS capable of wiki functions, Drupal-like sidebar blocks, and everything in between
* Version 1.1.6 (Caoineag beta 1)
* Copyright (C) 2006-2008 Dan Fuhry
* search.php - algorithm used to search pages
*
* This program is Free Software; you can redistribute and/or modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied
* warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for details.
*/
/**
 * Legacy search class. Before Enano 1.0.2 it implemented a keyword-based boolean search; that logic was removed for speed
 * reasons and superseded by perform_search() further down in this file. The sole surviving method is buildIndex(), which
 * (per the original notes) is still called by the path manager and the newer search framework.
 *
 * @package Enano
 * @subpackage Page management frontend
 * @license GNU General Public License <http://enanocms.org/Special:GNU_General_Public_License>
 */
class Searcher
{
  public $results;
  public $index;
  public $warnings;
  public $match_case = false;

  /**
   * Builds a word => "key,key,..." map from a set of texts and stores it in $this->index.
   * Words shorter than 2 or longer than 63 bytes, stopwords, and words containing two consecutive apostrophes are skipped.
   * @param array Texts to index; each array key is recorded as the identifier of the text it belongs to
   */
  function buildIndex($texts)
  {
    $this->index = array();
    $stopwords = get_stopwords();

    foreach ( $texts as $page_key => $raw_text )
    {
      // Apostrophes are swapped for a random alphanumeric placeholder so they survive the non-word-character scrub
      $placeholder = 'xxxApoS' . md5(microtime(true) . mt_rand()) . 'xxx';
      $text = str_replace("'", $placeholder, $raw_text);
      $text = preg_replace('#([\W_]+)#i', ' ', $text);
      $text = preg_replace('#([ ]+?)#', ' ', $text);
      $text = preg_replace('#([\']*){2,}#s', '', $text);
      $text = str_replace($placeholder, "'", $text);

      // Keep only letters, digits, apostrophes and spaces
      $allowed = enano_str_split('ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789\' ', 1);
      $kept = array();
      foreach ( enano_str_split($text, 1) as $char )
      {
        if ( in_array($char, $allowed) )
        {
          $kept[] = $char;
        }
      }

      // Split into words and discard the ones that should not be indexed
      $accepted = array();
      foreach ( explode(' ', implode('', $kept)) as $word )
      {
        $rejected = strlen($word) < 2
          || in_array($word, $stopwords)
          || strlen($word) > 63
          || preg_match('/[\']{2,}/', $word);
        if ( !$rejected )
        {
          $accepted[] = $word;
        }
      }

      // Record this text's key under every accepted word, once per word
      foreach ( $accepted as $word )
      {
        if ( !isset($this->index[$word]) )
        {
          $this->index[$word] = array($page_key);
        }
        else if ( !in_array($page_key, $this->index[$word]) )
        {
          $this->index[$word][] = $page_key;
        }
      }
    }

    // Flatten each key list into a comma-separated string
    foreach ( array_keys($this->index) as $word )
    {
      $this->index[$word] = implode(',', $this->index[$word]);
    }
  }
}
/**
 * Searches the site for the specified string and returns an array with each value being an array filled with the following:
 *   page_id: string, self-explanatory
 *   namespace: string, self-explanatory
 *   page_length: integer, the length of the full page (CHAR_LENGTH of the stored text)
 *   page_text: string, the contents of the page (highlighted, trimmed to ~150 bytes if necessary)
 *   page_name: string, the page title with search terms highlighted
 *   score: numerical relevance score, rounded to 2 digits and calculated from which terms were present
 *   url_highlight: string, the page URL (query string removed) with search terms highlighted
 * Results are ordered by descending score. Special pages and plugin-provided results may carry extra keys (page_note, etc.).
 *
 * NOTE: all local variable names in this function are deliberately kept stable because plugin code attached to the
 * search_global_inner hook is eval()'d inside this scope and may read or modify them ($query, $query_phrase, $scores,
 * $page_data, $word_list, $required, ...).
 *
 * @param string Search query
 * @param array|reference Will be filled with any warnings encountered whilst parsing the query
 * @param bool Case sensitivity - defaults to false
 * @param array|reference Will be filled with the parsed list of words. Not touched if the query has no positive terms.
 * @return array
 */
function perform_search($query, &$warnings, $case_sensitive = false, &$word_list = null)
{
  global $db, $session, $paths, $template, $plugins; // Common objects
  global $lang;

  $warnings = array();
  // strstr()/stristr() return the matched tail or false, so they double as "contains" tests for non-empty terms
  $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';

  //
  // STAGE 0: PARSE SEARCH QUERY
  // Identify all terms of the query. Separate between what is required and what is not, and what should be sent through the index as
  // opposed to straight-out LIKE-selected.
  //

  $query = parse_search_query($query, $warnings);

  // Segregate search terms that cannot go through the word index (anything other than A-Z, a-z and apostrophes,
  // which notably includes phrases with spaces)
  $query_phrase = array(
      'any' => array(),
      'req' => array()
    );
  foreach ( array('any', 'req') as $kind )
  {
    foreach ( $query[$kind] as $i => $_ )
    {
      $term = trim($query[$kind][$i]);
      $query[$kind][$i] = $term;
      if ( preg_match('/[^A-Za-z\']/', $term) )
      {
        $query_phrase[$kind][] = $term;
        unset($query[$kind][$i]);
      }
    }
    $query[$kind] = array_values($query[$kind]);
  }
  unset($term);

  $results = array();
  $scores = array();
  $ns_list = '(' . implode('|', array_keys($paths->nslist)) . ')';

  // FIXME: Update to use FULLTEXT algo when available.

  if ( count($query['any']) < 1 && count($query['req']) < 1 && count($query_phrase['any']) < 1 && count($query_phrase['req']) < 1 )
  {
    // This is both because of technical restrictions and devastation that would occur on shared servers/large sites.
    $warnings[] = $lang->get('search_err_query_no_positive');
    return array();
  }

  //
  // STAGE 1
  // Get all possible result pages from the search index. Tally which pages have the most words, and later sort them by boolean relevance
  //

  // Skip this if no indexable words are included
  if ( count($query['any']) > 0 || count($query['req']) > 0 )
  {
    $where_any = array();
    foreach ( array_merge($query['any'], $query['req']) as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_any[] = $term;
    }

    $col_word = ( $case_sensitive ) ? 'word' : 'word_lcase';
    $where_any = '( ' . $col_word . ' = \'' . implode('\' OR ' . $col_word . ' = \'', $where_any) . '\' )';

    $sql = "SELECT word, page_names FROM " . table_prefix . "search_index WHERE {$where_any}";
    if ( !($q = $db->sql_unbuffered_query($sql)) )
      $db->_die('Error is in perform_search(), includes/search.php, query 1');

    // page ID string => list of distinct words (case-folded unless searching case-sensitively) already counted for it
    $word_tracking = array();
    while ( $row = $db->fetchrow() )
    {
      $word_cs = ( $case_sensitive ) ? $row['word'] : strtolower($row['word']);

      // Page IDs should never contain commas because sanitize_page_id() escapes them. Nevertheless, for compatibility
      // with older databases, any piece that does not look like "ns=<namespace>;pid=<id>" is treated as the continuation
      // of the previous ID and glued back onto it.
      $matches = explode(',', $row['page_names']);
      $prev = false;
      foreach ( $matches as $i => $match )
      {
        // $prev must be compared against false explicitly: the first entry has index 0, which is falsy
        if ( $prev !== false && !preg_match("/^ns=$ns_list;pid=(.+)$/", $match) )
        {
          $matches[$prev] .= ',' . $match;
          unset($matches[$i]);
          continue;
        }
        $prev = $i;
      }

      // Assign scores based on how many distinct search words each page contains. This phase is word-based, not
      // page-based: a page showing up in several rows simply contains several of the searched words.
      foreach ( $matches as $match )
      {
        if ( isset($word_tracking[$match]) && in_array($word_cs, $word_tracking[$match], true) )
        {
          // same word in a different case - already counted for this page
          continue;
        }
        $word_tracking[$match][] = $word_cs;

        $inc = 1;
        // Is this search term present in the page's title? If so, give extra points
        if ( preg_match("/^ns=$ns_list;pid=(.+)$/", $match, $piecesparts) )
        {
          $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
          if ( isset($paths->pages[$pathskey]) )
          {
            if ( $test_func($paths->pages[$pathskey]['name'], $row['word']) || $test_func($paths->pages[$pathskey]['urlname_nons'], $row['word']) )
            {
              $inc = 1.5;
            }
          }
        }
        $scores[$match] = ( isset($scores[$match]) ? $scores[$match] : 0 ) + $inc;
      }
    }
    $db->free_result();

    //
    // STAGE 2: FIRST ELIMINATION ROUND
    // Iterate through the list of required terms. If a given page is not found to have the required term, eliminate it
    //

    foreach ( $query['req'] as $term )
    {
      // $word_tracking holds case-folded words for case-insensitive searches, so fold the term the same way
      $term_cs = ( $case_sensitive ) ? $term : strtolower($term);
      foreach ( $word_tracking as $i => $page )
      {
        if ( !in_array($term_cs, $page, true) )
        {
          unset($word_tracking[$i], $scores[$i]);
        }
      }
    }
  }

  //
  // STAGE 3: PHRASE SEARCHING
  // Use LIKE to find pages with specified phrases, searching the full page_text column instead of relying on a word list.
  //

  $text_col = ( $case_sensitive ) ? 'page_text' : ENANO_SQLFUNC_LOWERCASE . '(page_text)';
  $name_col = ( $case_sensitive ) ? 'name' : ENANO_SQLFUNC_LOWERCASE . '(name)';
  $text_col_join = ( $case_sensitive ) ? 't.page_text' : ENANO_SQLFUNC_LOWERCASE . '(t.page_text)';
  $name_col_join = ( $case_sensitive ) ? 'p.name' : ENANO_SQLFUNC_LOWERCASE . '(p.name)';

  $concat_column = ( ENANO_DBLAYER == 'MYSQL' ) ?
    'CONCAT(\'ns=\',t.namespace,\';pid=\',t.page_id)' :
    "'ns=' || t.namespace || ';pid=' || t.page_id";

  // We can skip this stage if none of these special terms apply
  if ( count($query_phrase['any']) > 0 || count($query_phrase['req']) > 0 )
  {
    $where_any = array();
    foreach ( $query_phrase['any'] as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_any[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
    }
    $where_any = ( count($where_any) > 0 ) ? implode(" OR\n ", $where_any) : '';

    // Also do required terms, but use AND to ensure that all required terms are included
    $where_req = array();
    foreach ( $query_phrase['req'] as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_req[] = "( $text_col LIKE '%$term%' OR $name_col LIKE '%$term%' )";
    }
    $and_clause = ( $where_any != '' ) ? 'AND ' : '';
    $where_req = ( count($where_req) > 0 ) ? "{$and_clause}" . implode(" AND\n ", $where_req) : '';

    $sql = 'SELECT ' . $concat_column . ' AS id, p.name FROM ' . table_prefix . "page_text AS t\n"
         . " LEFT JOIN " . table_prefix . "pages AS p\n"
         . " ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
         . " WHERE p.visible = 1 AND (\n $where_any\n $where_req\n );";
    if ( !($q = $db->sql_unbuffered_query($sql)) )
      $db->_die('Error is in perform_search(), includes/search.php, query 2. Parsed query dump follows:<pre>(indexable) ' . htmlspecialchars(print_r($query, true)) . '(non-indexable) ' . htmlspecialchars(print_r($query_phrase, true)) . '</pre>');

    while ( $row = $db->fetchrow() )
    {
      $id = $row['id'];
      $inc = 1;

      // Is any phrase present in the page's title? If so, give extra points
      if ( preg_match("/^ns=$ns_list;pid=(.+)$/", $id, $piecesparts) )
      {
        $pathskey = $paths->nslist[ $piecesparts[1] ] . sanitize_page_id($piecesparts[2]);
        if ( isset($paths->pages[$pathskey]) )
        {
          foreach ( array_merge($query_phrase['any'], $query_phrase['req']) as $term )
          {
            if ( $test_func($paths->pages[$pathskey]['name'], $term) || $test_func($paths->pages[$pathskey]['urlname_nons'], $term) )
            {
              $inc = 1.5;
              break;
            }
          }
        }
      }
      $scores[$id] = ( isset($scores[$id]) ? $scores[$id] : 0 ) + $inc;
    }
    $db->free_result();
  }

  //
  // STAGE 4 - SELECT PAGE TEXT AND ELIMINATE NOTS
  // At this point, we have a complete list of all the possible pages. Now we want to obtain the page text, and within the same query
  // eliminate any terms that shouldn't be in there.
  //

  // Generate master word list for the highlighter
  $word_list = array_values(array_merge($query['any'], $query['req'], $query_phrase['any'], $query_phrase['req']));

  // Every required term (indexed or phrase) must show up in the text, ID or title of a result
  $required = array_merge($query['req'], $query_phrase['req']);

  $text_where = array();
  foreach ( $scores as $page_id => $_ )
  {
    $text_where[] = $db->escape($page_id);
  }
  $text_where = '( ' . $concat_column . ' = \'' . implode('\' OR ' . $concat_column . ' = \'', $text_where) . '\' )';

  if ( count($query['not']) > 0 )
    $text_where .= ' AND';

  $where_not = array();
  foreach ( $query['not'] as $term )
  {
    $term = escape_string_like($term);
    if ( !$case_sensitive )
      $term = strtolower($term);
    $where_not[] = $term;
  }
  $where_not = ( count($where_not) > 0 ) ? "$text_col NOT LIKE '%" . implode("%' AND $text_col NOT LIKE '%", $where_not) . "%'" : '';

  $sql = 'SELECT ' . $concat_column . ' AS id, t.page_id, t.namespace, CHAR_LENGTH(t.page_text) AS page_length, t.page_text, p.name AS page_name FROM ' . table_prefix . "page_text AS t\n"
       . "LEFT JOIN " . table_prefix . "pages AS p\n"
       . "ON ( p.urlname = t.page_id AND p.namespace = t.namespace )\n"
       . "WHERE p.visible = 1 AND ( $text_where $where_not );";
  if ( !($q = $db->sql_unbuffered_query($sql)) )
    $db->_die('Error is in perform_search(), includes/search.php, query 3');

  $page_data = array();
  while ( $row = $db->fetchrow() )
  {
    // SECOND ELIMINATION ROUND (regular pages): the required terms are checked against the raw row, before it is
    // HTML-escaped, highlighted and clipped. Checking the processed text would wrongly drop pages whose required term
    // falls outside the clipped excerpt, differs in case, or is split apart by <highlight> tags.
    if ( !_search_all_terms_present(array($row['page_text'], $row['page_id'], $row['page_name']), $required, $case_sensitive) )
    {
      continue;
    }

    $row['page_text'] = htmlspecialchars($row['page_text']);
    $row['page_name'] = htmlspecialchars($row['page_name']);

    // Highlight results (this is wonderfully automated)
    $row['page_text'] = highlight_and_clip_search_result($row['page_text'], $word_list, $case_sensitive);
    if ( strlen($row['page_text']) > 250 && !preg_match('/^\.\.\.(.+)\.\.\.$/', $row['page_text']) )
    {
      $row['page_text'] = substr($row['page_text'], 0, 150) . '...';
    }
    $row['page_name'] = highlight_search_result($row['page_name'], $word_list, $case_sensitive);

    $page_data[$row['id']] = $row;
  }
  $db->free_result();

  //
  // STAGE 5 - SPECIAL PAGE TITLE SEARCH
  // Iterate through $paths->pages and check the titles for search terms. Score accordingly.
  //

  $any = array_values(array_unique(array_merge($query['any'], $query_phrase['any'])));
  foreach ( $paths->pages as $id => $page )
  {
    if ( $page['namespace'] != 'Special' || $page['visible'] == 0 )
      continue;

    $idstring = 'ns=' . $page['namespace'] . ';pid=' . $page['urlname_nons'];
    foreach ( $any as $term )
    {
      if ( $test_func($page['name'], $term) || $test_func($page['urlname_nons'], $term) )
      {
        $scores[$idstring] = ( isset($scores[$idstring]) ? $scores[$idstring] : 0 ) + 1.5;
      }
    }

    // SECOND ELIMINATION ROUND (special pages): they have no text, so required terms must be in the ID or the title
    if ( isset($scores[$idstring]) && _search_all_terms_present(array($page['urlname_nons'], $page['name']), $required, $case_sensitive) )
    {
      $page_data[$idstring] = array(
          'page_name' => highlight_search_result($page['name'], $word_list, $case_sensitive),
          'page_text' => '',
          'page_id' => $page['urlname_nons'],
          'namespace' => $page['namespace'],
          'score' => $scores[$idstring],
          'page_length' => 1,
          'page_note' => '[' . $lang->get('search_result_tag_special') . ']'
        );
    }
  }

  // At this point, all of our normal results are in. However, we can also allow plugins to hook into the system and score their own
  // pages and add text, etc. as necessary.
  // Plugins are COMPLETELY responsible for using the search terms and handling Boolean logic properly

  inject_custom_search_results($query, $query_phrase, $scores, $page_data, $case_sensitive, $word_list);

  // NOTE(review): hook code is eval()'d in this function's scope; it comes from installed plugins, never from the query.
  $code = $plugins->setHook('search_global_inner');
  foreach ( $code as $cmd )
  {
    eval($cmd);
  }

  // a marvelous debugging aid :-)
  // die('<pre>' . htmlspecialchars(print_r($page_data, true)) . '</pre>');

  //
  // STAGE 7 - SCORE AND SORT RESULTS
  // Sort the final results in descending order of score and convert raw scores into percentages.
  //

  // Sort scores array
  arsort($scores);

  // Divisor for calculating relevance scores
  $divisor = ( count($query['any']) + count($query_phrase['any']) + count($query['req']) + count($query['not']) ) * 1.5;
  if ( $divisor <= 0 )
  {
    // Only required phrases were given (at least one exists, otherwise we would have returned early).
    // Without this fallback the score calculation below divides by zero.
    $divisor = count($query_phrase['req']) * 1.5;
  }

  foreach ( $scores as $page_id => $score )
  {
    if ( !isset($page_data[$page_id]) )
      // It's possible that $scores contains a score for a page that was later eliminated
      continue;

    // Make a copy of the datum, then delete the original (it frees up a LOT of RAM)
    $datum = $page_data[$page_id];
    unset($page_data[$page_id]);

    // This is an internal value used for sorting - it's no longer needed.
    unset($datum['id']);

    // Calculate score
    $datum['score'] = round($score / $divisor, 2) * 100;

    // Highlight the URL
    $datum['url_highlight'] = makeUrlComplete($datum['namespace'], $datum['page_id']);
    $datum['url_highlight'] = preg_replace('/\?.+$/', '', $datum['url_highlight']);
    $datum['url_highlight'] = highlight_search_result($datum['url_highlight'], $word_list, $case_sensitive);

    // Store it in our until-now-unused results array
    $results[] = $datum;
  }

  // Our work here is done. :-D
  return $results;
}

/**
 * Private helper for perform_search(): tells whether every term occurs in at least one of the given strings.
 * Empty terms are treated as present.
 * @param array Strings to look in
 * @param array Terms that must all be found
 * @param bool If true, matches case-sensitively
 * @return bool
 */
function _search_all_terms_present($haystacks, $terms, $case_sensitive = false)
{
  $finder = ( $case_sensitive ) ? 'strpos' : 'stripos';
  foreach ( $terms as $term )
  {
    $term = (string) $term;
    if ( $term === '' )
      continue;

    $found = false;
    foreach ( $haystacks as $haystack )
    {
      if ( $finder((string) $haystack, $term) !== false )
      {
        $found = true;
        break;
      }
    }
    if ( !$found )
      return false;
  }
  return true;
}
/**
 * Parses a search query into an associative array. The resultant array will be filled with the following values, each an array:
 *   any: Search terms that can optionally be present (plain words and "quoted phrases")
 *   req: Search terms that must be present (+word, +"quoted phrase")
 *   not: Search terms that should not be present (-word, -"quoted phrase")
 * Terms are separated by spaces; spaces inside double quotes do not split a term. At most 19 terms are processed, after
 * which a "too many terms" warning is recorded and parsing stops. Rejected terms (too short, stopwords, duplicates) do not
 * count towards that limit.
 * @param string Search query
 * @param array Will be filled with parser warnings, such as query too short, words too short, etc.
 * @return array
 */
function parse_search_query($query, &$warnings)
{
  global $lang;

  $stopwords = get_stopwords();
  $ret = array(
    'any' => array(),
    'req' => array(),
    'not' => array()
    );
  $warnings = array();

  //
  // Pass 1: split the query on spaces that are not inside a quoted section
  //
  $terms = array();
  $in_quote = false;
  $start_term = 0;
  $query_len = strlen($query);
  for ( $i = 0; $i < $query_len; $i++ )
  {
    $chr = $query[$i];
    $is_last = ( $i + 1 == $query_len );
    if ( ( $chr == ' ' && !$in_quote ) || $is_last )
    {
      // The last character always closes the current term, taking everything that is left
      $terms[] = ( $is_last ) ? substr($query, $start_term) : substr($query, $start_term, $i - $start_term);
      $start_term = $i + 1;
    }
    else if ( $chr == '"' )
    {
      if ( !$in_quote )
      {
        $in_quote = true;
      }
      else if ( $query[$i - 1] != '\\' )
      {
        // a backslash-escaped quote does not end the quoted section
        $in_quote = false;
      }
    }
  }

  //
  // Pass 2: classify each term by its prefix and validate it
  //
  $ticker = 0;
  foreach ( $terms as $atom )
  {
    $ticker++;
    if ( $ticker == 20 )
    {
      $warnings[] = $lang->get('search_err_query_too_many_terms');
      break;
    }

    $atom_len = strlen($atom);
    $prefix2 = substr($atom, 0, 2);
    $prefix1 = substr($atom, 0, 1);
    $ends_with_quote = ( $atom_len > 0 && $atom[$atom_len - 1] == '"' );

    // Defaults shared by every kind of term except excluded phrases
    $min_length = 2;
    $check_stopwords = true;
    $reject_message = 'search_err_query_has_stopwords';

    if ( $prefix2 == '+"' && $ends_with_quote )
    {
      $bucket = 'req';
      $word = substr($atom, 2, $atom_len - 3);
    }
    else if ( $prefix2 == '-"' && $ends_with_quote )
    {
      // Excluded phrases must be at least 4 characters and are not checked against the stopword list
      $bucket = 'not';
      $word = substr($atom, 2, $atom_len - 3);
      $min_length = 4;
      $check_stopwords = false;
      $reject_message = 'search_err_query_term_too_short';
    }
    else if ( $prefix1 == '+' )
    {
      $bucket = 'req';
      $word = substr($atom, 1);
    }
    else if ( $prefix1 == '-' )
    {
      $bucket = 'not';
      $word = substr($atom, 1);
    }
    else if ( $prefix1 == '"' && $ends_with_quote )
    {
      $bucket = 'any';
      $word = substr($atom, 1, $atom_len - 2);
    }
    else
    {
      $bucket = 'any';
      $word = $atom;
    }
    // substr() returns false instead of '' for out-of-range offsets before PHP 8
    $word = (string) $word;

    if ( strlen($word) < $min_length || ( $check_stopwords && in_array($word, $stopwords, true) ) )
    {
      $warnings[] = $lang->get($reject_message);
      $ticker--;
      continue;
    }
    if ( in_array($word, $ret[$bucket], true) )
    {
      $warnings[] = $lang->get('search_err_query_dup_terms');
      $ticker--;
      continue;
    }
    $ret[$bucket][] = $word;
  }
  return $ret;
}
/**
 * Prepares a string for embedding inside a SQL LIKE pattern: the value is first passed through the database layer's
 * escape() method, then the LIKE wildcards % and _ are prefixed with a backslash so they match literally.
 * @param string
 * @return string
 */
function escape_string_like($string)
{
  global $db;

  $wildcards = array(
    '%' => '\%',
    '_' => '\_'
    );
  return str_replace(array_keys($wildcards), array_values($wildcards), $db->escape($string));
}
/**
 * Wraps <highlight></highlight> tags around every occurrence of the given words in the text. Does not perform any clipping.
 * Empty words are ignored; if no usable word remains, or the regular expression engine fails, the text is returned unchanged.
 * @param string Text to process
 * @param array Word list
 * @param bool If true, searches case-sensitively when highlighting words
 * @return string
 */
function highlight_search_result($pt, $words, $case_sensitive = false)
{
  $words2 = array();
  // foreach instead of an index loop, so word lists with gaps or string keys are handled too
  foreach ( (array) $words as $word )
  {
    if ( !empty($word) )
    {
      // The delimiter must be passed explicitly, otherwise a "/" inside a word terminates the pattern early
      $words2[] = preg_quote($word, '/');
    }
  }

  if ( count($words2) < 1 )
  {
    // An empty alternation would match the empty string at every position and litter the text with empty tags
    return $pt;
  }

  $flag = ( $case_sensitive ) ? '' : 'i';
  $regex = '/(' . implode('|', $words2) . ')/' . $flag;
  $highlighted = preg_replace($regex, '<highlight>\\1</highlight>', $pt);

  // preg_replace() returns null on failure; never lose the text because of that
  return ( $highlighted === null ) ? $pt : $highlighted;
}
/**
 * Wraps <highlight></highlight> tags around all of the given words in the text, then clips the text to an excerpt around
 * the first occurrence of the first word (in list order) that is found. The word is located case-insensitively regardless
 * of $case_sensitive, which only affects highlighting.
 *
 * Clipping rules, where "hit" is the byte offset of the located word in the highlighted text:
 *   - hit further than 75 bytes in: the excerpt is "..." + (start of the text, or the text following the last whitespace
 *     before offset hit-75) + the 75 bytes before the hit + the word + the 75 bytes after it, extended to the next
 *     whitespace + "..."
 *   - hit within the first 75 bytes and the text is longer than 200 bytes: everything up to and including the word is
 *     kept and only the tail is clipped as above (a trailing "..." is always appended)
 *   - otherwise the highlighted text is returned unclipped
 * If none of the words occur, the highlighted text is returned as is.
 *
 * @param string Text to process
 * @param array Word list
 * @param bool If true, searches case-sensitively when highlighting words
 * @return string
 */
function highlight_and_clip_search_result($pt, $words, $case_sensitive = false)
{
  $space_chars = "\t\n\r ";
  $context = 75;

  $pt = highlight_search_result($pt, $words, $case_sensitive);

  foreach ( $words as $word )
  {
    $len = strlen($word);
    if ( $len < 1 )
      continue;

    $hit = stripos($pt, $word);
    if ( $hit === false )
      // this word isn't in the text, try the next one
      continue;

    $found = substr($pt, $hit, $len);
    $after = $hit + $len;

    if ( $hit - $context > 0 )
    {
      // Navigate backwards from the start of the leading context until a space character is found.
      // Offset 0 is deliberately not examined; without an earlier space the whole lead-in is kept.
      $chunk = substr($pt, 0, $hit - $context);
      $lead = $chunk;
      for ( $j = strlen($chunk) - 1; $j > 0; $j-- )
      {
        if ( strpos($space_chars, $chunk[$j]) !== false )
        {
          $lead = (string) substr($chunk, $j + 1);
          break;
        }
      }
      $clipped = '...' . $lead . substr($pt, $hit - $context, $context) . $found;
    }
    else if ( strlen($pt) > 200 )
    {
      $clipped = substr($pt, 0, $hit) . $found;
    }
    else
    {
      // Short text with the word near the beginning: nothing to clip
      return $pt;
    }

    // Trailing context: the next 75 bytes, then carry on to the next space character (or the end of the text)
    $rest = (string) substr($pt, $after + $context);
    $tail = (string) substr($rest, 0, strcspn($rest, $space_chars));

    return $clipped . substr($pt, $after, $context) . $tail . '...';
  }

  return $pt;
}
/**
 * Provides the list of common words that normally should not be indexed or searched for.
 * The list is built on the first call and served from a static cache afterwards.
 * @return array
 */
function get_stopwords()
{
  static $cache = null;

  if ( $cache === null )
  {
    $cache = array(
      'I', 'a', 'about', 'an', 'are', 'as', 'at', 'be', 'by', 'com', 'de', 'en', 'for', 'from', 'how', 'in', 'is', 'it',
      'la', 'of', 'on', 'or', 'that', 'the', 'this', 'to', 'was', 'what', 'when', 'where', 'who', 'will', 'with', 'and',
      'the'
      );
  }

  return $cache;
}
/**
 * Private function to inject custom results into a search. For every entry of the global $search_handlers array, a query
 * is generated against the handler's table, and each matching row is scored and added to $page_data.
 *
 * Handler options read by this function:
 *   table: table name (table_prefix is prepended)
 *   titlecolumn: column holding the result title
 *   datacolumn: (optional) column holding the result text
 *   additionalcolumns: (optional) array of extra columns to select
 *   additionalwhere: (optional) raw SQL appended after the generated WHERE clause
 *   uniqueid: template text that yields the result's unique ID string when run against the row
 *   linkformat: array with template texts "page_id" and "namespace", and optionally "append"
 *   formatcallback: (optional) callable invoked with ($row, $word_list), or template text, producing the result text
 *   resultnote: (optional) note stored as page_note
 *
 * Scoring: each entry of $word_list found in the title adds 1.5; otherwise, if found in the data column, it adds 1.
 *
 * @param array Parsed query (indexable terms); keys any, req, not
 * @param array Parsed query (phrase terms); keys any, req
 * @param array Scores, keyed by unique ID string. Modified in place.
 * @param array Result data, keyed by unique ID string. Modified in place.
 * @param bool Case sensitivity
 * @param array Master word list used for scoring and highlighting
 */
function inject_custom_search_results(&$query, &$query_phrase, &$scores, &$page_data, &$case_sensitive, &$word_list)
{
  global $db, $session, $paths, $template, $plugins; // Common objects
  global $lang;
  global $search_handlers;

  if ( !is_array($search_handlers) )
  {
    // no handlers registered
    return;
  }

  $terms = array(
      'any' => array_merge($query['any'], $query_phrase['any']),
      'req' => array_merge($query['req'], $query_phrase['req']),
      'not' => $query['not']
    );
  $test_func = ( $case_sensitive ) ? 'strstr' : 'stristr';

  // Iterated by value: the handlers are only read, and a by-reference loop would leave a dangling reference
  // into the global array behind.
  foreach ( $search_handlers as $options )
  {
    $title_col = ( $case_sensitive ) ? $options['titlecolumn'] : ENANO_SQLFUNC_LOWERCASE . '(' . $options['titlecolumn'] . ')';
    if ( isset($options['datacolumn']) )
      $desc_col = ( $case_sensitive ) ? $options['datacolumn'] : ENANO_SQLFUNC_LOWERCASE . '(' . $options['datacolumn'] . ')';
    else
      // no data column: compare against an empty string literal so the generated SQL stays valid
      $desc_col = "''";

    //
    // Build the WHERE clause
    //
    $where_any = array();
    $where_req = array();
    $where_not = array();
    foreach ( $terms['any'] as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_any[] = "( $title_col LIKE '%{$term}%' OR $desc_col LIKE '%{$term}%' )";
    }
    foreach ( $terms['req'] as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_req[] = "( $title_col LIKE '%{$term}%' OR $desc_col LIKE '%{$term}%' )";
    }
    foreach ( $terms['not'] as $term )
    {
      $term = escape_string_like($term);
      if ( !$case_sensitive )
        $term = strtolower($term);
      $where_not[] = "$title_col NOT LIKE '%{$term}%' AND $desc_col NOT LIKE '%{$term}%'";
    }

    // Only non-empty groups take part in the final clause; imploding a missing group is a fatal TypeError on PHP 8
    $where = array();
    if ( !empty($where_any) )
    {
      // Optional terms must not restrict the result set once required or excluded terms are present
      $always_true = ( !empty($where_req) || !empty($where_not) ) ? ' OR 1 = 1' : '';
      $where[] = '(' . implode(' OR ', $where_any) . $always_true . ')';
    }
    if ( !empty($where_req) )
      $where[] = implode(' AND ', $where_req);
    if ( !empty($where_not) )
      $where[] = implode(' AND ', $where_not);

    if ( empty($where) )
    {
      // Nothing to search for; "WHERE (  )" would be a syntax error
      continue;
    }
    $where = implode(' AND ', $where);

    $columns = $options['titlecolumn'];
    if ( isset($options['datacolumn']) )
      $columns .= ", {$options['datacolumn']}";
    if ( isset($options['additionalcolumns']) )
      $columns .= ', ' . implode(', ', $options['additionalcolumns']);

    $additionalwhere = ( isset($options['additionalwhere']) ) ? $options['additionalwhere'] : '';

    $sql = "SELECT $columns FROM " . table_prefix . "{$options['table']} WHERE ( $where ) $additionalwhere;";

    if ( !($q = $db->sql_unbuffered_query($sql)) )
    {
      $db->_die('Automatically generated search query');
    }

    while ( $row = $db->fetchrow($q) )
    {
      $parser = $template->makeParserText($options['uniqueid']);
      $parser->assign_vars($row);
      $idstring = $parser->run();

      // Score this result
      foreach ( $word_list as $term )
      {
        if ( $test_func($row[$options['titlecolumn']], $term) )
        {
          $scores[$idstring] = ( isset($scores[$idstring]) ? $scores[$idstring] : 0 ) + 1.5;
        }
        else if ( isset($options['datacolumn']) && $test_func($row[$options['datacolumn']], $term) )
        {
          $scores[$idstring] = ( isset($scores[$idstring]) ? $scores[$idstring] : 0 ) + 1;
        }
      }

      // Generate text...
      $text = '';
      if ( isset($options['formatcallback']) )
      {
        if ( is_callable($options['formatcallback']) )
        {
          $text = @call_user_func($options['formatcallback'], $row, $word_list);
        }
        else
        {
          // not callable: treat it as template text
          $parser = $template->makeParserText($options['formatcallback']);
          $parser->assign_vars($row);
          $text = $parser->run();
        }
      }
      else if ( isset($options['datacolumn']) )
      {
        $text = highlight_and_clip_search_result(htmlspecialchars($row[$options['datacolumn']]), $word_list, $case_sensitive);
      }

      // Inject result (only rows that actually scored)
      if ( isset($scores[$idstring]) )
      {
        $parser = $template->makeParserText($options['linkformat']['page_id']);
        $parser->assign_vars($row);
        $page_id = $parser->run();

        $parser = $template->makeParserText($options['linkformat']['namespace']);
        $parser->assign_vars($row);
        $namespace = $parser->run();

        $page_data[$idstring] = array(
          'page_name' => highlight_search_result(htmlspecialchars($row[$options['titlecolumn']]), $word_list, $case_sensitive),
          'page_text' => $text,
          'score' => $scores[$idstring],
          'page_id' => $page_id,
          'namespace' => $namespace,
        );

        // Any additional flags that need to be added to the result?
        // The small usually-bracketed text to the left of the title
        if ( isset($options['resultnote']) )
        {
          $page_data[$idstring]['page_note'] = $options['resultnote'];
        }
        // Should we include the length?
        if ( isset($options['datacolumn']) )
        {
          $page_data[$idstring]['page_length'] = strlen($row[$options['datacolumn']]);
        }
        else
        {
          $page_data[$idstring]['page_length'] = 0;
          $page_data[$idstring]['zero_length'] = true;
        }
        // Anything to append to result links?
        if ( isset($options['linkformat']['append']) )
        {
          $page_data[$idstring]['url_append'] = $options['linkformat']['append'];
        }
      }
    }

    // Always release the unbuffered result, including when the query matched no rows
    $db->free_result($q);
  }
}
?>