includes/wikiengine/Parse/Mediawiki/Url.php
author Dan
Mon, 28 Jan 2008 23:07:32 -0500
changeset 279 b6faa6d6ade2
parent 1 fe660c52c48f
permissions -rw-r--r--
Fixed case where HTML comments were getting stripped when opening tag not followed by whitespace (<!--foo--> was stripped, <!-- foo --> was not, neither is stripped now)

<?php

/**
* 
* Parse for URLS in the source text.
* 
* @category Text
* 
* @package Text_Wiki
* 
* @author Paul M. Jones <pmjones@php.net>
* 
* @author Moritz Venn <moritz.venn@freaque.net>
* 
* @license LGPL
* 
* @version $Id: Url.php,v 1.1 2005/12/06 15:54:56 ritzmo Exp $
* 
*/

/**
* 
* Parse for URLS in the source text.
* 
* Various URL markings are supported: inline (the URL by itself),
* inline (where the URL is enclosed in square brackets), and named
* reference (where the URL is enclosed in square brackets and has a
* name included inside the brackets).  E.g.:
*
* inline      -- http://example.com
* undescribed -- [http://example.com]
* described   -- [http://example.com Example Description]
* described   -- [http://www.example.com|Example Description]
*
* When rendering a URL token, this will convert URLs pointing to a .gif,
* .jpg, or .png image into an inline <img /> tag (for the 'xhtml'
* format).
*
* Token options are:
* 
* 'type' => ['inline'|'footnote'|'descr'] the type of URL
* 
* 'href' => the URL link href portion
* 
* 'text' => the displayed text of the URL link
* 
* @category Text
* 
* @package Text_Wiki
* 
* @author Paul M. Jones <pmjones@php.net>
* 
* @author Moritz Venn <moritz.venn@freaque.net>
* 
*/

class Text_Wiki_Parse_Url extends Text_Wiki_Parse {
    
    
    /**
    * 
    * Keeps a running count of numbered-reference URLs.
    * 
    * @access public
    * 
    * @var int
    * 
    */
    
    var $footnoteCount = 0;
    
    
    /**
    * 
    * URL schemes recognized by this rule.
    * 
    * @access public
    * 
    * @var array
    * 
    */
    
    var $conf = array(
        'schemes' => array(
            'http://',
            'https://',
            'ftp://',
            'gopher://',
            'news://',
            'mailto:',
            'irc://'
        )
    );
    
    
    /**
    * 
    * Constructor.
    * 
    * We override the constructor so we can comment the regex nicely.
    * 
    * @access public
    * 
    */
    
    function Text_Wiki_Parse_Url(&$obj)
    {
        parent::Text_Wiki_Parse($obj);
        
        // convert the list of recognized schemes to a regex-safe string,
        // where the pattern delim is a slash
        $tmp = array();
        $list = $this->getConf('schemes', array());
        foreach ($list as $val) {
            $tmp[] = preg_quote($val, '/');
        }
        $schemes = implode('|', $tmp);
        
        // build the regex
        $this->regex =
            "($schemes)" . // allowed schemes
            "(" . // start pattern
            "[^ \\/\"\'{$this->wiki->delim}]*\\/" . // no spaces, backslashes, slashes, double-quotes, single quotes, or delimiters;
            ")*" . // end pattern
            "[^ \\t\\n\\/\"\'{$this->wiki->delim}]*" .
            "[A-Za-z0-9\\/?=&~_]";
            // fix for jEdit syntax highlighting bug: \"
    }
    
    
    /**
    * 
    * Find three different kinds of URLs in the source text.
    *
    * @access public
    * 
    */
    
    function parse()
    {
        // -------------------------------------------------------------
        // 
        // Described-reference (named) URLs.
        // 

        // the regular expression for this kind of URL
        $tmp_regex = '/\[(' . $this->regex . ')[ |]([^\]]+)\]/';

        // use a custom callback processing method to generate
        // the replacement text for matches.
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processDescr'),
            $this->wiki->source
        );

        
        // -------------------------------------------------------------
        // 
        // Unnamed-reference ('Ordinary'-style) URLs.
        // 
        
        // the regular expression for this kind of URL
        $tmp_regex = '/\[(' . $this->regex . ')\]/U';
        
        // use a custom callback processing method to generate
        // the replacement text for matches.
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            //array(&$this, 'processFootnote'),
            array(&$this, 'processOrdinary'),
            $this->wiki->source
        );
        
        
        // -------------------------------------------------------------
        // 
        // Normal inline URLs.
        // 
        
        /*
        
        ## DISABLED FOR ENANO
        ## This messes up HTML links.
        
        // the regular expression for this kind of URL
        
        $tmp_regex = '/(^|[^A-Za-z])(' . $this->regex . ')(.*?)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'process'),
            $this->wiki->source
        );

        //$tmp_regex = '/(^|[^A-Za-z])([a-zA-Z])(.*?)/';
        $tmp_regex = '/(^|\s)([a-zA-Z0-9\-]+\.[a-zA-Z0-9\-]+(\.[a-zA-Z0-9\-]+)+)($|\s)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processWithoutProtocol'),
            $this->wiki->source
        );

        $tmp_regex = '/(^|\s|'.$this->wiki->delim.')<([a-zA-Z0-9\-\.%_\+\!\*\'\(\)\,]+@[a-zA-Z0-9\-]+(\.[a-zA-Z0-9\-]+)+)>(\s|'.$this->wiki->delim.'|$)/';
        
        // use the standard callback for inline URLs
        $this->wiki->source = preg_replace_callback(
            $tmp_regex,
            array(&$this, 'processInlineEmail'),
            $this->wiki->source
        );
        */
    }
    
    
    /**
    * 
    * Process inline URLs.
    * 
    * @param array &$matches
    * 
    * @param array $matches An array of matches from the parse() method
    * as generated by preg_replace_callback.  $matches[0] is the full
    * matched string, $matches[1] is the first matched pattern,
    * $matches[2] is the second matched pattern, and so on.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function process(&$matches)
    {
        // set options
        $options = array(
            'type' => 'inline',
            'href' => $matches[2],
            'text' => $matches[2]
        );
        
        // tokenize
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[5];
    }

    function processWithoutProtocol(&$matches)
    {
        // set options
        $options = array(
            'type' => 'inline',
            'href' => 'http://'.$matches[2],
            'text' => $matches[2]
        );
        
        // tokenize
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[4];
    }

    function processInlineEmail(&$matches)
    {
        // set options
        $options = array(
            'type' => 'inline',
            'href' => 'mailto://'.$matches[2],
            'text' => $matches[2]
        );
        
        // tokenize
        return $matches[1] . $this->wiki->addToken($this->rule, $options) . $matches[4];
    }    
    
    /**
    * 
    * Process numbered (footnote) URLs.
    * 
    * Token options are:
    * @param array &$matches
    * 
    * @param array $matches An array of matches from the parse() method
    * as generated by preg_replace_callback.  $matches[0] is the full
    * matched string, $matches[1] is the first matched pattern,
    * $matches[2] is the second matched pattern, and so on.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processFootnote(&$matches)
    {
        // keep a running count for footnotes 
        $this->footnoteCount++;
        
        // set options
        $options = array(
            'type' => 'footnote',
            'href' => $matches[1],
            'text' => $this->footnoteCount
        );
        
        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
    
     function processOrdinary(&$matches)
    {
    	// keep a running count for footnotes 
        $this->footnoteCount++;
        
        // set options
        $options = array(
            'type' => 'descr',
            'href' => $matches[1],
            'text' => $matches[1]
        );
        
        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
    
    
    /**
    * 
    * Process described-reference (named-reference) URLs.
    * 
    * Token options are:
    *     'type' => ['inline'|'footnote'|'descr'] the type of URL
    *     'href' => the URL link href portion
    *     'text' => the displayed text of the URL link
    * 
    * @param array &$matches
    * 
    * @param array $matches An array of matches from the parse() method
    * as generated by preg_replace_callback.  $matches[0] is the full
    * matched string, $matches[1] is the first matched pattern,
    * $matches[2] is the second matched pattern, and so on.
    * 
    * @return string The processed text replacement.
    * 
    */ 
    
    function processDescr(&$matches)
    {
        // set options
        $options = array(
            'type' => 'descr',
            'href' => $matches[1],
            'text' => $matches[4]
        );

        // tokenize
        return $this->wiki->addToken($this->rule, $options);
    }
}
?>